Identifying heterogeneous treatment effects for online single-session interventions for adolescent depression: a secondary analysis

Note: Press the button on the upper right of this file to show all codes.

Report: Download PART_III.pdf. Git repo can be found here

1 Data preparation

First, load dataset and clean data as necessary.

## read and create subsets
cope_subset <- read_csv("data/cope_subset.csv")
cope_subset %>% datatable(
  rownames = FALSE,
  options = list(
    pageLength = 5,
    columnDefs = list(list(className = 'dt-center', 
                      targets = 0:4))))
## gender identity subset
dem_gender_subset <- cope_subset %>% dplyr::select(starts_with("b_dem_gender"))

## coping strategies

dem_cope_subset <- cope_subset %>% dplyr::select(starts_with("b_covid_cope"))

## family challenges

dem_family_subset <- cope_subset %>% dplyr::select(starts_with("b_covid_family"))

## race

dem_race_subset <- cope_subset %>% dplyr::select(starts_with("b_dem_race"))

## sexual orientation

dem_orient_subset <- cope_subset %>% dplyr::select(b_dem_orientation)

## coping strategy subset
dem_cope_subset <- dem_cope_subset %>%
  dplyr::select(-"b_covid_cope_1_including_talking_with_people_you_trust_about_your_concerns_and_how_you_are_feeling")

1.1 Collapse categories

Re-code sexual orientation, race, family challenge variables by collapsing some categories with limited number of samples, and generating composite categories.

1.1.0.1 Recode sexual orientation

## recode sexual orientation
recode_orient_dat <- dem_orient_subset %>%
mutate(recod_orient= case_when(
  b_dem_orientation %in% c("Asexual", "I do not use a label", "Other/Not listed (please specify)") ~ "Other",
  b_dem_orientation %in% c("Gay/Lesbian/Homosexual", "Queer", "Unsure/Questioning") ~ "LGBTQ",
  b_dem_orientation %in% c("Bisexual", "Pansexual") ~ "LGBTQ",
  b_dem_orientation == "Heterosexual/Straight" ~ "Heterosexual",
  TRUE ~ b_dem_orientation
))

1.1.0.2 Recode family challenges

## recode family challenge
### number of challenges
challenge_num_dat <- dem_family_subset %>%
  mutate(challenge_num = apply(., 1, sum)) %>%
  mutate(challenge_num = ifelse(b_covid_family_the_covid_19_pandemic_has_not_affected_me_or_my_family_in_these_ways_in_the_past_2_weeks=="1",0,challenge_num)) %>%
  mutate(challenge_num3 = ifelse(challenge_num>=2, 2, challenge_num))

### category of challenges
challenge_cat_dat <-  dem_family_subset %>%
  mutate(challenge_cat = case_when(
    b_covid_family_family_did_not_enough_enough_money_for_food == 1 |
      b_covid_family_family_did_not_have_enough_money_for_gas_transportation == 1 |
      b_covid_family_family_did_not_have_a_regular_place_to_sleep_or_stay==1 |
      b_covid_family_family_did_not_have_enough_money_to_pay_rent == 1 ~ "Financial",

    b_covid_family_i_could_not_attend_school_in_person == 1 |
      b_covid_family_i_could_not_attend_school_at_all == 1 ~ "School",

    b_covid_family_other == 1 ~ "Other",

    b_covid_family_the_covid_19_pandemic_has_not_affected_me_or_my_family_in_these_ways_in_the_past_2_weeks == 1 ~ "No impact",

    TRUE ~ "No"
  ))


challenge_cat_dat <- dem_family_subset %>%
  mutate(
    Financial = b_covid_family_family_did_not_enough_enough_money_for_food == 1 |
      b_covid_family_family_did_not_have_enough_money_for_gas_transportation == 1 |
      b_covid_family_family_did_not_have_a_regular_place_to_sleep_or_stay == 1 |
      b_covid_family_family_did_not_have_enough_money_to_pay_rent == 1,

    School = b_covid_family_i_could_not_attend_school_in_person == 1 |
      b_covid_family_i_could_not_attend_school_at_all == 1,

    Other = b_covid_family_other == 1,

    No_impact = b_covid_family_the_covid_19_pandemic_has_not_affected_me_or_my_family_in_these_ways_in_the_past_2_weeks == 1,

    challenge_cat = case_when(
      Financial & School & Other ~ "Other",
      Financial & School ~ "Other",
      Financial & Other ~ "Other",
      School & Other ~ "Other",
      Financial ~ "Other",
      School ~ "School",
      Other ~ "Other",
      No_impact ~ "No impact",
      TRUE ~ "No challenge"
    )
  ) %>%
  dplyr::select(-Financial, -School, -Other, -No_impact)

1.1.0.3 Recode race

## identify all race patterns
race_pattern_dat <- cope_subset %>%
  dplyr::select(starts_with("b_dem_race")) %>%
  mutate(race_pattern = apply(., 1, function(row) paste0(row, collapse = "")))

cope_subset1 <- cope_subset %>%
  mutate(race_pattern =race_pattern_dat$race_pattern ) %>%
  ## recode race
  mutate(recode_race = case_when(
    race_pattern == "000010" ~ 'White',
    race_pattern == "000001" ~ 'Black/African-American',
    race_pattern == "010000" ~ 'Asian Including Asian Desi',
    race_pattern == "001000" ~ 'Hispanic/Latinx',
    race_pattern == "000000" ~ 'Prefer not to answer',
    TRUE ~ 'Mixed'
  )) %>%
  ## recode sexual orientation
  mutate(recode_orient = recode_orient_dat$recod_orient) %>% 
  
  dplyr::select(-race_pattern) %>%
  ## recode language
  mutate(recode_language = case_when(
    b_dem_language == "English" ~ 'English',
    TRUE ~ 'Other'
  )) %>%
  ## reorder variables
  dplyr::select(-b_dem_language,-starts_with("b_dem_race")) %>%
  dplyr::select(recode_race,starts_with("b_dem_gender"),b_screener_age,b_dem_sex,b_dem_orientation,
         starts_with("b_covid_family"), starts_with("b_covid_cope"),recode_language, everything()
         )

1.2 Latent Class Analysis(LCA)

Apply LCA to find the latent class variables for gender identity and coping strategies, respectively.

1.2.1 Gender identity

## Gender identity
tmp <- dem_gender_subset%>%
  mutate(gender_pattern = apply(., 1, function(row) paste0(row, collapse = "")))

## check patterns
tmp1 <- tmp$gender_pattern %>% table() %>% data.frame()

genderid_dat <- dem_gender_subset

## check correlation
genderid_dat_cor <- cor(genderid_dat) %>% data.frame


# The transgender indicator and gender_expansive indicator variables are removed because these two variables are the 
# composite variables of other indicators


### correlation matrix 
#genderid_dat_cor_melted <- melt(as.matrix(genderid_dat_cor))


# ggplot(data = genderid_dat_cor_melted, aes(x=Var1, y=Var2, fill=value)) +
#   geom_tile(color = "white") +
#   scale_fill_gradient2(low = "blue", high = "red", mid = "white",
#                        midpoint = 0, limit = c(-1, 1), space = "Lab",
#                        name="Correlation") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 45, vjust = 1,
#                                    size = 12, hjust = 1)) +
#   coord_fixed() +
#   labs(x = "", y = "") +
#   ggtitle("Correlation Matrix Heatmap")

## prepare input data for poLCA

## only accept indicator variables coded start from 1 (transfer 01 --> 12)
recode_variables <- function(df) {
  df <- df %>% mutate(across(everything(), ~ as.integer(as.factor(.))))
  return(df)
}


## input of the poLCA
genderid_dat_LCA <- genderid_dat %>%
  recode_variables() %>%
  rename_with(~ str_remove(., "b_dem_gender_"), starts_with("b_dem_gender_"))

## define the indicator variables of LCA model
f1 <- cbind(
  agender, not_sure, other_please_specify,
  androgynous, nonbinary, two_spirited,
  female_to_male_transgender_ftm, trans_female_trans_feminine,
  trans_male_trans_masculine,
  # gender_expansive, ## composite removed
  third_gender, genderqueer, male_to_female_transgender_mtf,
  man_boy,
  # transgender, ## composite removed
  woman_girl) ~ 1

set.seed(1017)
gender_LCA2 <- poLCA(f1, data = genderid_dat_LCA,
              nclass = 2)


gender_LCA3 <- poLCA(f1, data = genderid_dat_LCA,
              nclass = 3)

gender_LCA4 <- poLCA(f1, data = genderid_dat_LCA,
              nclass = 4)


gender_LCA5 <- poLCA(f1, data = genderid_dat_LCA,
              nclass = 5)


gender_models <- list(gender_LCA2, gender_LCA3, gender_LCA4, gender_LCA5)
gender_model_stats <- data.frame(
  Model = 2:5,
  G2 = sapply(gender_models, function(x) x$Gsq),
  AIC = sapply(gender_models, function(x) x$aic),
  BIC = sapply(gender_models, function(x) x$bic)
)

1.2.1.1 Step 1: LCA model selection

Compare LCA models for different number of classes.

the LCA model with 3 latent classes has comparable \(G^2\), AIC, BIC. (could also use scree plot). therefore, a model with 3 latent class is selected for gender identity.

kable(gender_model_stats, caption = "Model Comparison for Different Number of Classes (Gender Identity variable)")
Table 1.1: Model Comparison for Different Number of Classes (Gender Identity variable)
Model G2 AIC BIC
2 1348.0556 7529.195 7683.297
3 924.7536 7135.893 7369.704
4 821.6347 7062.774 7376.293
5 616.5568 6887.696 7280.923

1.2.1.2 Step 2: classification error: averaged posterior probability(APP)

print APP, results suggest low classification error(APP>0.7).

posterior_probs <- gender_LCA3$posterior

assigned_class <- apply(posterior_probs, 1, which.max)

mean_posterior_by_class <- numeric(ncol(posterior_probs))

for (k in 1:ncol(posterior_probs)) {
  mean_posterior_by_class[k] <- mean(posterior_probs[assigned_class == k, k])
}

print(mean_posterior_by_class)
## [1] 0.9648380 0.9863283 0.9650341

1.2.1.3 Step 3: Final latent class probabilities

The probability of belonging to a specific class is calculated for each individual, and each individual is classified to a specific class based on the max. posterior probability. Below is the results of the posterior probabilities for all subjects (prob. of being in class \(j\)).

## get predicted class (gender identity)
genderid_class <- gender_LCA3$predclass
genderid_predprob <- gender_LCA3$posterior
colnames(genderid_predprob) <- paste0("Class",c(1:3))
genderid_predprob %>% datatable(
  rownames = FALSE,
  options = list(
     pageLength = 5,
    columnDefs = list(list(className = 'dt-center', 
                      targets = 0:2))))

1.2.1.4 Step 4: Assign label to each latent class (gener identity)

Visualize the probabilities of answering yes of each item by latent class (Pr(individual answers yes to an item \(|\) in class \(j\))) to understand the underlying pattern.

# probabilities of answering yes by class
probs_list <- gender_LCA3$probs

probs_df <- do.call(rbind, lapply(names(probs_list), function(var) {
  prob_df <- as.data.frame(probs_list[[var]])
  prob_df$variable <- var
  prob_df$class <- rownames(prob_df)
  prob_df
})) %>% dplyr::select(-`Pr(1)`)

probs_df_list <- list()


for (var in names(probs_list)) {
  prob_df <- as.data.frame(probs_list[[var]])
  prob_df$variable <- var
  prob_df$class <- rownames(prob_df)
  probs_df_list[[var]] <- prob_df
}


probs_df_combined <- bind_rows(probs_df_list)%>%
  dplyr::select(-`Pr(1)`) %>% 
  mutate(variable = dplyr::recode(variable,
    "agender" = "agender",
    "not_sure" = "not sure",
    "other_please_specify" = "other",
    "androgynous" = "androgynous",
    "nonbinary" = "nonbinary",
    "two_spirited" = "two spirited",
    "female_to_male_transgender_ftm" = "female to male",
    "trans_female_trans_feminine" = "female trans feminine",
    "trans_male_trans_masculine" = "male trans masculine",
    "third_gender" = "third gender",
    "genderqueer" = "genderqueer",
    "male_to_female_transgender_mtf" = "male to female",
    "man_boy" = "man/boy",
    "woman_girl" = "woman/girl"
  )) %>% 
   mutate(class = dplyr::recode(class,
                                "class 1: " = "Class 1: Non-binary",
                                "class 2: " = "Class 2: Women/girls",
                                "class 3: " = "Class 3: Male/Masculine")
   ) %>% 
  group_by(variable) %>% 
  mutate(maxprob = max(`Pr(2)`)) %>%
  ungroup() %>% 
  mutate(maxprob = ifelse(maxprob==`Pr(2)`,maxprob,NA) %>% round(2),
         maxprob = ifelse(maxprob %in% c(0.01,0.02,0.03,0.12,0.23,0.2,0.19,0.09,0.38),NA,maxprob)
         )

## the condit. prob
gender_prob_plot <- ggplot(probs_df_combined, aes(x = variable, y = `Pr(2)`, color = class, group = class)) +
  geom_line(size = 1) + 
  geom_point(size = 1.5)+
  geom_text(aes(label = round(maxprob, 2)), vjust = -1, size = 3, show.legend = FALSE) +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12)) +  
  labs(x = "", y = "Probability of answering yes ") +
   ggtitle("(A) Latent classes for gender identity")+
  scale_color_brewer(palette = "Set1") +  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),  
    panel.grid.minor.x = element_blank() 
  )

gender_prob_plot

ggsave("figures/gender_prob_plot.png", width = 6, height = 6, dpi = 600)

1.2.2 Coping strategies

1.2.2.1 Step 1: LCA model selection

Build and compare LCA models

cope_dat_LCA <- dem_cope_subset %>%
  recode_variables() %>%
  rename_with(~ str_remove(., "b_covid_cope_1_"), starts_with("b_covid_cope_1_"))


f2 <- cbind(connecting_with_others, contacting_a_healthcare_provider,
        drinking_alcohol, smoking_more_cigarettes_or_vaping_more) ~ 1

set.seed(1017)
cope_LCA2 <- poLCA(f2, data = cope_dat_LCA,
              nclass = 2)
cope_LCA3 <- poLCA(f2, data = cope_dat_LCA,
              nclass = 3)

cope_LCA4 <- poLCA(f2, data = cope_dat_LCA,
              nclass = 4)
cope_LCA5 <- poLCA(f2, data = cope_dat_LCA,
              nclass = 5)

cope_models <- list(cope_LCA2, cope_LCA3, cope_LCA4, cope_LCA5)
cope_model_stats <- data.frame(
  Model = 2:5,
  G2 = sapply(cope_models, function(x) x$Gsq),
  AIC = sapply(cope_models, function(x) x$aic),
  BIC = sapply(cope_models, function(x) x$bic)
)
kable(cope_model_stats, caption = "Model Comparison for Different Number of Classes (Coping strategy variable)")
Table 1.2: Model Comparison for Different Number of Classes (Coping strategy variable)
Model G2 AIC BIC
2 29.066630 4548.577 4596.402
3 6.162343 4535.673 4610.067
4 6.058262 4545.569 4646.533
5 1.878940 4551.389 4678.923

1.2.2.2 Step 2: Classification error: averaged posterior probability(APP)

print averaged posterior probability(APP) for coping strategy, results suggest low classification error(APP>0.7).

posterior_probs <- cope_LCA3$posterior

assigned_class <- apply(posterior_probs, 1, which.max)

mean_posterior_by_class <- numeric(ncol(posterior_probs))

for (k in 1:ncol(posterior_probs)) {
  mean_posterior_by_class[k] <- mean(posterior_probs[assigned_class == k, k])
}

print(mean_posterior_by_class)
## [1] 0.8685438 0.7917650 0.9089532

1.2.2.3 Step 3: Final latent class probabilities

Similarly, get the predicted posterior probability of the latent variable for coping strategy.

## get predicted class (gender identity)
cope_class <- cope_LCA3$predclass
cope_predprob <- cope_LCA3$posterior
colnames(cope_predprob) <- paste0("Class",c(1:3))
cope_predprob %>% datatable(
  rownames = FALSE,
  options = list(
     pageLength = 5,
    columnDefs = list(list(className = 'dt-center', 
                      targets = 0:2))))

1.2.2.4 Step 4: Assign label to each latent class (coping strategies)

probs_list <- cope_LCA3$probs

probs_df <- do.call(rbind, lapply(names(probs_list), function(var) {
  prob_df <- as.data.frame(probs_list[[var]])
  prob_df$variable <- var
  prob_df$class <- rownames(prob_df)
  prob_df
})) %>% dplyr::select(-`Pr(1)`)

probs_df_list <- list()

for (var in names(probs_list)) {
  prob_df <- as.data.frame(probs_list[[var]])
  prob_df$variable <- var
  prob_df$class <- rownames(prob_df)
  probs_df_list[[var]] <- prob_df
}


probs_df_combined <- bind_rows(probs_df_list)%>%
  dplyr::select(-`Pr(1)`) %>% 
   mutate(variable = dplyr::recode(variable,
    "connecting_with_others" = "connect with others",
                      "contacting_a_healthcare_provider" = "contact healthcare provider",
                      "drinking_alcohol" = "drink alcohol",
                      "smoking_more_cigarettes_or_vaping_more" = "smoke/vape more")) %>% 
   mutate(class = dplyr::recode(class,
                                "class 3: " = "Class 3: Positive ",
                                "class 2: " = "Class 2: No action",
                                "class 1: " = "Class 1: Combined")
   ) %>% 
  group_by(variable) %>% 
  mutate(maxprob = max(`Pr(2)`)) %>%
  ungroup() %>% 
  mutate(maxprob = ifelse(maxprob==`Pr(2)`,maxprob,NA) %>% round(2),
         maxprob = ifelse(maxprob==0.16,NA,maxprob))

cope_prob_plot <- ggplot(probs_df_combined, aes(x = variable, y = `Pr(2)`, color = class, group = class)) +
  geom_line(size = 1) + 
  geom_point(size = 1.5)+
  geom_text(aes(label = round(maxprob, 2)), vjust = -1, size = 3, show.legend = FALSE) +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12)) + 
  labs(x = "", y = "Probability of answering yes ") +
   ggtitle("(B) Latent classes for coping strategies")+
  scale_color_brewer(palette = "Set1") +  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),  
    panel.grid.minor.x = element_blank() 
  )

cope_prob_plot

ggsave("figures/cope_prob_plot.png", width = 6, height = 6, dpi = 600)

1.3 Missing values

98.6% of the subjects have complete information.

miss_dat <- cope_subset1 %>%

  mutate(across(starts_with("recode_race"), ~ ifelse(. == "Prefer not to answer", NA, .))) %>%

  mutate(b_dem_sex = ifelse(b_dem_sex %in% c("Prefer not to say", "Other"), NA, b_dem_sex)) %>%
  
  mutate(across(starts_with("recode_orient"), ~ ifelse(. == "I do not want to respond", NA, .)))%>% dplyr::select(b_dem_sex,recode_orient,recode_race)

colnames(miss_dat) <- c("Biological sex","Sexual orientation","Race")

vis_miss(miss_dat)

Tabulate the number and percentage of missing. The missing rate is low, so complete case analysis will be used later.

missing_summary <- miss_var_summary(miss_dat)

miss_plot <- ggplot(missing_summary, aes(x = reorder(variable, -n_miss), y = n_miss)) +
  geom_bar(stat = "identity", fill = "lightblue",size=0.1) +
  geom_text(aes(label = paste0(n_miss, " (", round(pct_miss, 2), "%)")), 
            hjust = 1,  vjust = 0.5, color = "black") +
  coord_flip() +
  theme_minimal() +
  labs( #title = "Missing Data by Variable",
       x = "Variable",
       y = "Missing Count (Percentage)") +
  theme(axis.text.y = element_text(size = 10),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 14, face = "bold")) +
 # coord_cartesian(ylim = c(0, 40))+
  theme_minimal()
miss_plot 

Check if there is any missing pattern among missing variables: No pattern presents!

gg_miss_upset(miss_dat)

1.4 The final working dataset

After some investigation, I decided to do a complete case analysis(CCA), the final working dataset is then generated.

dat_cc <- cope_subset3 %>% 
   filter(recode_race!="Prefer not to answer") %>%  ## 1468
  filter(b_dem_sex!="Prefer not to say" & b_dem_sex!="Other") %>% ## 1447
  filter(orientation!="I do not want to respond") ## 1441
cope_subset2 <- cope_subset1 %>%

  mutate(across(starts_with("recode_race"), ~ ifelse(. == "Prefer not to answer", NA, .))) %>%

  mutate(b_dem_sex = ifelse(b_dem_sex %in% c("Prefer not to say", "Other"), NA, b_dem_sex)) %>%
  
  mutate(across(starts_with("recode_orient"), ~ ifelse(. == "I do not want to respond", NA, .)))

## use the latent variable genderid3 as a surragate for a series of gener identity related quesitons
cope_subset2$genderid3 <- genderid_class

## use the latent variable cope3 as a surragate for a series of coping strategies related quesitons
cope_subset2$cope3 <- cope_class

cope_subset3 <- cope_subset2 %>%
  mutate(family_num =challenge_num_dat$challenge_num3,
         family_cat = challenge_cat_dat$challenge_cat) %>%
  dplyr::select(c("b_response_id", "condition", "b_cdi_mean", "f1_cdi_mean", "recode_race",
           #"b_dem_gender_agender", "b_dem_gender_not_sure",
           #"b_dem_gender_other_please_specify", "b_dem_gender_androgynous",
           #"b_dem_gender_nonbinary", "b_dem_gender_two_spirited", "b_dem_gender_female_to_male_transgender_ftm",
           #"b_dem_gender_trans_female_trans_feminine", "b_dem_gender_trans_male_trans_masculine",
           #"b_dem_gender_gender_expansive", "b_dem_gender_third_gender",
           #"b_dem_gender_genderqueer", "b_dem_gender_male_to_female_transgender_mtf",
          # "b_dem_gender_man_boy", "b_dem_gender_transgender", "b_dem_gender_woman_girl",
           "b_screener_age", "b_dem_sex", "recode_orient",
            #"b_covid_family_family_did_not_enough_enough_money_for_food",
           #"b_covid_family_family_did_not_have_a_regular_place_to_sleep_or_stay",
           #"b_covid_family_i_could_not_attend_school_in_person", "b_covid_family_i_could_not_attend_school_at_all",
           #"b_covid_family_other", "b_covid_family_family_did_not_have_enough_money_for_gas_transportation",
           #"b_covid_family_family_did_not_have_enough_money_to_pay_rent",
           #"b_covid_family_the_covid_19_pandemic_has_not_affected_me_or_my_family_in_these_ways_in_the_past_2_weeks",
           #"b_covid_cope_1_connecting_with_others", "b_covid_cope_1_including_talking_with_people_you_trust_about_your_concerns_and_how_you_are_feeling",
           #"b_covid_cope_1_contacting_a_healthcare_provider", "b_covid_cope_1_drinking_alcohol",
           #"b_covid_cope_1_smoking_more_cigarettes_or_vaping_more",
           "recode_language",
           "genderid3","family_num","family_cat",
           # "family3",
          "cope3")) %>% 
  mutate(
   # condition = factor(condition),
    recode_race = factor(recode_race),
   # b_screener_age = factor(b_screener_age),
    b_dem_sex = factor(b_dem_sex),
    recode_orient = factor(recode_orient),
    genderid3 = factor(genderid3),
    family_num = factor(family_num),
    family_cat = factor(family_cat),
    cope3 = factor(cope3)
  )

dat_cc <- cope_subset3 %>% 
   filter(!is.na(recode_race)) %>%  ## 1468
  filter(!is.na(b_dem_sex)) %>% ## 1447
  filter(!is.na(recode_orient) )## 1441


dat_cc %>%  datatable(
  rownames = FALSE,
  options = list(
    pageLength = 5))
save(dat_cc,file="data/dat_cc.rds")
table1_dat <- cope_subset3 %>% 

 mutate(genderid3 = dplyr::recode(genderid3,
                                "1" = "Non-binary",
                                "2" = "Women/girls",
                                "3" = "Male/Masculine")
   ) %>% 
     mutate(family_num = dplyr::recode(family_num,
                                "0" = "0",
                                "2" = ">=2",
                                "1" = "1")
   )  %>% 
  mutate(cope3 = dplyr::recode(cope3,
                                "3" = "Positive",
                                "2" = "No action",
                                "1" = "Combined")
   )  %>% 
   filter(!is.na(recode_race)) %>%  ## 1468
  filter(!is.na(b_dem_sex)) %>% ## 1447
  filter(!is.na(recode_orient) )## 1441
  

colnames(table1_dat) <- c("id", "condition", "Baseline CDI mean score(0-2)", "3-month CDI mean score", 
"Race", "Age (yrs)", "Biological sex", "Sexual orientation", 
"Language", "Gender identity", "Number of challenges", "Type of challenges", "Type of coping strategies"
)

2 Descriptive table (table 1)

A Descriptive table is generated using the 1441 subjects with complete data. Summary stats is stratified by treatment condition.

table1_cc <- tbl_summary(table1_dat %>% dplyr::select(-id,-`3-month CDI mean score`),
                      by = condition, statistic = list(all_continuous() ~ "{mean} ({sd})",
                                                                        all_categorical() ~ "{n} ({p}%)"), digits = all_continuous() ~ 2,
                      # label = list(
                      #              b_dem_sex ~ "Biological Sex",
                      #              b_dem_orientation ~ "Sexual Orientation"
                      #              #,
                      #              # b_cdi_sum ~ "Baseline CDI Sum Score (0-24)"
                      #              )
                      ) %>%
  modify_header(label ~ "**Demographics**") %>%
  modify_spanning_header(c("stat_1", "stat_2","stat_3") ~ "**Treatment Received**")
table1_cc
Demographics Treatment Received
Placebo Control
N = 488
1
Project ABC
N = 489
1
Project Personality
N = 464
1
Baseline CDI mean score(0-2) 1.16 (0.35) 1.15 (0.34) 1.17 (0.36)
Race


    Asian Including Asian Desi 50 (10%) 58 (12%) 50 (11%)
    Black/African-American 33 (6.8%) 40 (8.2%) 36 (7.8%)
    Hispanic/Latinx 57 (12%) 61 (12%) 53 (11%)
    Mixed 74 (15%) 68 (14%) 63 (14%)
    White 274 (56%) 262 (54%) 262 (56%)
Age (yrs)


    13 28 (5.7%) 32 (6.5%) 28 (6.0%)
    14 77 (16%) 81 (17%) 63 (14%)
    15 150 (31%) 156 (32%) 162 (35%)
    16 233 (48%) 220 (45%) 211 (45%)
Biological sex


    Female 434 (89%) 437 (89%) 418 (90%)
    Male 54 (11%) 52 (11%) 46 (9.9%)
Sexual orientation


    Heterosexual 108 (22%) 97 (20%) 106 (23%)
    LGBTQ 309 (63%) 327 (67%) 291 (63%)
    Other 71 (15%) 65 (13%) 67 (14%)
Language


    English 476 (98%) 476 (97%) 450 (97%)
    Other 12 (2.5%) 13 (2.7%) 14 (3.0%)
Gender identity


    Non-binary 102 (21%) 98 (20%) 86 (19%)
    Women/girls 313 (64%) 325 (66%) 306 (66%)
    Male/Masculine 73 (15%) 66 (13%) 72 (16%)
Number of challenges


    0 98 (20%) 99 (20%) 85 (18%)
    1 276 (57%) 297 (61%) 275 (59%)
    >=2 114 (23%) 93 (19%) 104 (22%)
Type of challenges


    No impact 98 (20%) 99 (20%) 85 (18%)
    Other 119 (24%) 97 (20%) 124 (27%)
    School 271 (56%) 293 (60%) 255 (55%)
Type of coping strategies


    Combined 58 (12%) 61 (12%) 53 (11%)
    No action 228 (47%) 235 (48%) 236 (51%)
    Positive 202 (41%) 193 (39%) 175 (38%)
1 Mean (SD); n (%)
## write tex file
table1_df <- as.data.frame(table1_cc)
latex_table1 <- xtable(table1_df)

writeLines(print(latex_table1, type = "latex",include.rownames = FALSE), paste0("table1_df_",Sys.Date(),".tex"))
## % latex table generated in R 4.3.1 by xtable 1.8-4 package
## % Fri Aug  2 16:23:19 2024
## \begin{table}[ht]
## \centering
## \begin{tabular}{llll}
##   \hline
## **Demographics** & **Placebo Control**  
## N = 488 & **Project ABC**  
## N = 489 & **Project Personality**  
## N = 464 \\ 
##   \hline
## Baseline CDI mean score(0-2) & 1.16 (0.35) & 1.15 (0.34) & 1.17 (0.36) \\ 
##   Race &  &  &  \\ 
##   Asian Including Asian Desi & 50 (10\%) & 58 (12\%) & 50 (11\%) \\ 
##   Black/African-American & 33 (6.8\%) & 40 (8.2\%) & 36 (7.8\%) \\ 
##   Hispanic/Latinx & 57 (12\%) & 61 (12\%) & 53 (11\%) \\ 
##   Mixed & 74 (15\%) & 68 (14\%) & 63 (14\%) \\ 
##   White & 274 (56\%) & 262 (54\%) & 262 (56\%) \\ 
##   Age (yrs) &  &  &  \\ 
##   13 & 28 (5.7\%) & 32 (6.5\%) & 28 (6.0\%) \\ 
##   14 & 77 (16\%) & 81 (17\%) & 63 (14\%) \\ 
##   15 & 150 (31\%) & 156 (32\%) & 162 (35\%) \\ 
##   16 & 233 (48\%) & 220 (45\%) & 211 (45\%) \\ 
##   Biological sex &  &  &  \\ 
##   Female & 434 (89\%) & 437 (89\%) & 418 (90\%) \\ 
##   Male & 54 (11\%) & 52 (11\%) & 46 (9.9\%) \\ 
##   Sexual orientation &  &  &  \\ 
##   Heterosexual & 108 (22\%) & 97 (20\%) & 106 (23\%) \\ 
##   LGBTQ & 309 (63\%) & 327 (67\%) & 291 (63\%) \\ 
##   Other & 71 (15\%) & 65 (13\%) & 67 (14\%) \\ 
##   Language &  &  &  \\ 
##   English & 476 (98\%) & 476 (97\%) & 450 (97\%) \\ 
##   Other & 12 (2.5\%) & 13 (2.7\%) & 14 (3.0\%) \\ 
##   Gender identity &  &  &  \\ 
##   Non-binary & 102 (21\%) & 98 (20\%) & 86 (19\%) \\ 
##   Women/girls & 313 (64\%) & 325 (66\%) & 306 (66\%) \\ 
##   Male/Masculine & 73 (15\%) & 66 (13\%) & 72 (16\%) \\ 
##   Number of challenges &  &  &  \\ 
##   0 & 98 (20\%) & 99 (20\%) & 85 (18\%) \\ 
##   1 & 276 (57\%) & 297 (61\%) & 275 (59\%) \\ 
##   $>$=2 & 114 (23\%) & 93 (19\%) & 104 (22\%) \\ 
##   Type of challenges &  &  &  \\ 
##   No impact & 98 (20\%) & 99 (20\%) & 85 (18\%) \\ 
##   Other & 119 (24\%) & 97 (20\%) & 124 (27\%) \\ 
##   School & 271 (56\%) & 293 (60\%) & 255 (55\%) \\ 
##   Type of coping strategies &  &  &  \\ 
##   Combined & 58 (12\%) & 61 (12\%) & 53 (11\%) \\ 
##   No action & 228 (47\%) & 235 (48\%) & 236 (51\%) \\ 
##   Positive & 202 (41\%) & 193 (39\%) & 175 (38\%) \\ 
##    \hline
## \end{tabular}
## \end{table}
age_summary <- table1_dat %>%
  mutate(`Age (yrs)` = as.numeric(`Age (yrs)`)) %>% 
  group_by(condition) %>%
  summarise(
    count = n(),
    mean_age = mean(`Age (yrs)`, na.rm = TRUE),
    sd_age = sd(`Age (yrs)`, na.rm = TRUE),
    min_age = min(`Age (yrs)`, na.rm = TRUE),
    max_age = max(`Age (yrs)`, na.rm = TRUE),
    median_age = median(`Age (yrs)`),
    IQR_age = IQR(`Age (yrs)`, na.rm = TRUE)
  )

3 Baseline CDI Prediction Model

Will use risk-based approach.

The first step is to construct the baseline CDI prediction model. The focus of the “risk” prediction model is on accurately predicting individuals’ “disease” risk. Several considerations need to be taken into account:

Statistical Model Selection:

Consideration of models: will use linear model (simple and interpretability)

Performance Metrics:

For continuous outcomes, models will be evaluated and selected based on root mean squared error (RMSE), calibration slope, and calibration-at-large.

Overfitting:

Employ a leave-one-out cross-validation (LOOCV) framework to address overfitting. LOOCV will be conducted on 80% of the samples (derivation cohort), while the remaining 20% will serve as a test cohort to mimic an external validation. Final model will be constructed on the entire dataset.

## subset data
dat_abc <- dat_cc %>% dplyr::select(-family_num,-recode_language) %>%  ## removed because of corr.
  filter(condition!="Project Personality") %>%## ABC vs control
  mutate(condition = as.character(condition),
         condition = as.factor(condition)
  )
dat_person <- dat_cc %>% dplyr::select(-family_num,-recode_language) %>% 
  filter(condition!="Project ABC") %>%  ## Personality vs control
 mutate(condition = as.character(condition),
        condition = as.factor(condition)
        )

a glimpse of the variables:

str(dat_abc)
## tibble [977 × 11] (S3: tbl_df/tbl/data.frame)
##  $ b_response_id : chr [1:977] "R_02uyQw7i0v8yQRX" "R_089oJuXofDsITq9" "R_08lyOxzUAzbJqcV" "R_0lDh6r7xrSRAemR" ...
##  $ condition     : Factor w/ 2 levels "Placebo Control",..: 1 1 1 2 1 1 1 2 2 2 ...
##  $ b_cdi_mean    : num [1:977] 1.417 1.167 1.333 1 0.917 ...
##  $ f1_cdi_mean   : num [1:977] 1 0.583 1.417 0.833 0.583 ...
##  $ recode_race   : Factor w/ 5 levels "Asian Including Asian Desi",..: 5 5 5 1 5 4 5 5 5 5 ...
##  $ b_screener_age: num [1:977] 14 16 15 14 16 14 14 16 16 16 ...
##  $ b_dem_sex     : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ recode_orient : Factor w/ 3 levels "Heterosexual",..: 2 2 2 1 1 2 1 1 2 2 ...
##  $ genderid3     : Factor w/ 3 levels "1","2","3": 1 2 2 2 2 2 2 2 2 2 ...
##  $ family_cat    : Factor w/ 3 levels "No impact","Other",..: 2 1 3 3 3 2 3 2 3 2 ...
##  $ cope3         : Factor w/ 3 levels "1","2","3": 3 3 2 2 2 2 2 2 2 2 ...

3.1 Build prediction model

Build prediction models using caret

basemodel_dat <- dat_abc %>% dplyr::select(-condition,-b_cdi_mean,-b_response_id,-b_screener_age,)
set.seed(1017)

trainIndex <- createDataPartition(basemodel_dat$f1_cdi_mean, p = .8, list = FALSE, times = 1)
trainData <- basemodel_dat[trainIndex, ]
testData <- basemodel_dat[-trainIndex, ]



train_control <- trainControl(method = "LOOCV")

#train_control <- trainControl(method = "cv", number = 3)

# rf_model <- train(f1_cdi_mean ~ ., 
#                   data = trainData, 
#                   method = "rf", 
#                   trControl = train_control)
tune_grid <- expand.grid(alpha = 0,
                         lambda = 10^seq(-3, 1, length = 10))

abc_model <- train(f1_cdi_mean ~ ., 
                  data = trainData, 
                  method = "glmnet", 
                  trControl = train_control,
                  tuneGrid = tune_grid,
                  metric = "RMSE")

valid_pred <- predict(abc_model, newdata = trainData)

test_pred <- predict(abc_model, newdata = testData)

## mse
rmse_f <- function(actual, pred) {
  sqrt(mean((actual - pred) ^ 2))
}


## perf metrics
### rmse
valid_rmse <- rmse_f(trainData$f1_cdi_mean, valid_pred)
test_rmse <- rmse_f(testData$f1_cdi_mean, test_pred)

### calibration slope & at large
valid_cali_m <- lm(trainData$f1_cdi_mean~ valid_pred)
valid_cali_slope <- coef(valid_cali_m)[2]
valid_cali_at_large <- coef(valid_cali_m)[1]

test_cali_m <- lm(testData$f1_cdi_mean~ test_pred)
test_cali_slope <- coef(test_cali_m)[2]
test_cali_at_large <- coef(test_cali_m)[1]

perf_abc <- data.frame(
  Metric = c("RMSE", "Calibration Slope", "Calibration at Large"),
  Validation = c(valid_rmse, valid_cali_slope, valid_cali_at_large),
  Test = c(test_rmse, test_cali_slope, test_cali_at_large)
)
basemodel_dat <- dat_person%>% dplyr::select(-condition,-b_cdi_mean,-b_response_id,-b_screener_age,)
set.seed(1017)

trainIndex <- createDataPartition(basemodel_dat$f1_cdi_mean, p = .8, list = FALSE, times = 1)
trainData <- basemodel_dat[trainIndex, ]
testData <- basemodel_dat[-trainIndex, ]



train_control <- trainControl(method = "LOOCV")

#train_control <- trainControl(method = "cv", number = 3)

# rf_model <- train(f1_cdi_mean ~ ., 
#                   data = trainData, 
#                   method = "rf", 
#                   trControl = train_control)
tune_grid <- expand.grid(alpha = 10^seq(0,1, length = 10),
                         lambda = 10^seq(-3, 1, length = 10))
person_model <- train(f1_cdi_mean ~ ., 
                  data = trainData, 
                  method = "glmnet", 
                  trControl = train_control,
                  tuneGrid = tune_grid,
                  metric = "RMSE")

valid_pred <- predict(person_model, newdata = trainData)

test_pred <- predict(person_model, newdata = testData)

## mse
rmse_f <- function(actual, pred) {
  sqrt(mean((actual - pred) ^ 2))
}


## perf metrics
### rmse
valid_rmse <- rmse_f(trainData$f1_cdi_mean, valid_pred)
test_rmse <- rmse_f(testData$f1_cdi_mean, test_pred)

### calibration slope & at large
valid_cali_m <- lm(trainData$f1_cdi_mean~ valid_pred)
valid_cali_slope <- coef(valid_cali_m)[2]
valid_cali_at_large <- coef(valid_cali_m)[1]

test_cali_m <- lm(testData$f1_cdi_mean~ test_pred)
test_cali_slope <- coef(test_cali_m)[2]
test_cali_at_large <- coef(test_cali_m)[1]

perf_person <- data.frame(
  Metric = c("RMSE", "Calibration Slope", "Calibration at Large"),
  Validation = c(valid_rmse, valid_cali_slope, valid_cali_at_large),
  Test = c(test_rmse, test_cali_slope, test_cali_at_large)
)

Summary of baseline model output of project ABC model:

abc_model %>% summary()
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        1300   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           5   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        13   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          0   -none-     list

Summary of baseline model output of project personality model:

person_model %>% summary()
##             Length Class      Mode     
## a0           65    -none-     numeric  
## beta        845    dgCMatrix  S4       
## df           65    -none-     numeric  
## dim           2    -none-     numeric  
## lambda       65    -none-     numeric  
## dev.ratio    65    -none-     numeric  
## nulldev       1    -none-     numeric  
## npasses       1    -none-     numeric  
## jerr          1    -none-     numeric  
## offset        1    -none-     logical  
## call          5    -none-     call     
## nobs          1    -none-     numeric  
## lambdaOpt     1    -none-     numeric  
## xNames       13    -none-     character
## problemType   1    -none-     character
## tuneValue     2    data.frame list     
## obsLevels     1    -none-     logical  
## param         0    -none-     list

3.2 Evaluate model performance

Summaries the model performance in the following table:

rownames(perf_abc) <- c()
rownames(perf_person) <- c()

kable(perf_abc, caption = "Model performance of baseline risk model (project ABC)")
Table 3.1: Model performance of baseline risk model (project ABC)
Metric Validation Test
RMSE 0.3992314 0.3959155
Calibration Slope 1.1575945 0.9232901
Calibration at Large -0.1543406 0.0670571
kable(perf_person, caption = "Model performance of baseline risk model (project personality)")
Table 3.1: Model performance of baseline risk model (project personality)
Metric Validation Test
RMSE 0.4055562 0.4266310
Calibration Slope 1.0819270 0.0040360
Calibration at Large -0.0809549 0.9874272

3.3 Final baseline prediciton model

The final model is constructed on the entire dataset

basemodel_dat_abc <- dat_abc %>% dplyr::select(-condition,-b_cdi_mean,-b_response_id,-b_screener_age,)

best_lambda_abc <- abc_model$bestTune$lambda
best_alpha <- abc_model$bestTune$alpha ## fixed 0 (ridge)

covariates_abc <- model.matrix(f1_cdi_mean ~ ., data = basemodel_dat_abc)[, -1]  # Create the model matrix
y_abc <- basemodel_dat_abc$f1_cdi_mean 

abc_base_m <- glmnet::glmnet(covariates_abc, y_abc, alpha = best_alpha, lambda = best_lambda_abc)

coef(abc_base_m)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                                             s0
## (Intercept)                        1.024079643
## recode_raceBlack/African-American -0.029861721
## recode_raceHispanic/Latinx        -0.067205445
## recode_raceMixed                  -0.008337803
## recode_raceWhite                   0.041071994
## b_dem_sexMale                     -0.121873161
## recode_orientLGBTQ                 0.073487928
## recode_orientOther                 0.102386860
## genderid32                        -0.109173072
## genderid33                        -0.018497319
## family_catOther                    0.060132421
## family_catSchool                   0.020780601
## cope32                            -0.024043888
## cope33                            -0.112895337
abc_base_m <- lm(basemodel_dat_abc$f1_cdi_mean~.,data = basemodel_dat_abc)


# sjPlot::tab_model(abc_base_m)

basemodel_dat_person <- dat_person %>% dplyr::select(-condition,-b_cdi_mean,-b_response_id,-b_screener_age,)

best_lambda_person <- person_model$bestTune$lambda
best_alpha <- abc_model$bestTune$alpha ## fixed 0 (ridge)

covariates_person <- model.matrix(f1_cdi_mean ~ ., data = basemodel_dat_person)[, -1]  # Create the model matrix
y_person <- basemodel_dat_person$f1_cdi_mean 

person_base_m <- glmnet::glmnet(covariates_person, y_person, alpha = best_alpha, lambda = best_lambda_person)
coef(person_base_m)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                                             s0
## (Intercept)                        1.152897654
## recode_raceBlack/African-American -0.054500296
## recode_raceHispanic/Latinx        -0.048753462
## recode_raceMixed                  -0.030923716
## recode_raceWhite                  -0.002056701
## b_dem_sexMale                     -0.104694777
## recode_orientLGBTQ                 0.068796666
## recode_orientOther                 0.064098555
## genderid32                        -0.146439033
## genderid33                        -0.102595869
## family_catOther                    0.020356363
## family_catSchool                  -0.015311446
## cope32                            -0.054863363
## cope33                            -0.125546490
person_base_m <- lm(basemodel_dat_person$f1_cdi_mean~.,data = basemodel_dat_person)

# sjPlot::tab_model(person_base_m)

3.3.1 Project ABC baseline model

A summary of the baseline prediction model, for project ABC:

sjPlot::tab_model(abc_base_m)
  Dependent variable
Predictors Estimates CI p
(Intercept) 1.06 0.92 – 1.21 <0.001
recode race
[Black/African-American]
-0.04 -0.16 – 0.08 0.527
recode race
[Hispanic/Latinx]
-0.08 -0.19 – 0.02 0.129
recode race [Mixed] -0.03 -0.13 – 0.08 0.617
recode race [White] 0.03 -0.05 – 0.12 0.462
b dem sex [Male] -0.15 -0.25 – -0.04 0.007
recode orient [LGBTQ] 0.09 0.02 – 0.16 0.010
recode orient [Other] 0.12 0.03 – 0.22 0.008
genderid3 [2] -0.13 -0.20 – -0.06 <0.001
genderid3 [3] -0.02 -0.12 – 0.08 0.687
family cat [Other] 0.07 -0.01 – 0.15 0.070
family cat [School] 0.03 -0.03 – 0.10 0.313
cope3 [2] -0.06 -0.14 – 0.02 0.137
cope3 [3] -0.16 -0.24 – -0.08 <0.001
Observations 977
R2 / R2 adjusted 0.079 / 0.067

3.3.2 Project personality baseline model

for project personality:

sjPlot::tab_model(person_base_m)
  Dependent variable
Predictors Estimates CI p
(Intercept) 1.16 1.00 – 1.31 <0.001
recode race
[Black/African-American]
-0.06 -0.18 – 0.07 0.387
recode race
[Hispanic/Latinx]
-0.05 -0.16 – 0.06 0.382
recode race [Mixed] -0.03 -0.14 – 0.08 0.552
recode race [White] -0.00 -0.09 – 0.09 0.937
b dem sex [Male] -0.11 -0.22 – 0.01 0.066
recode orient [LGBTQ] 0.07 0.00 – 0.14 0.046
recode orient [Other] 0.06 -0.03 – 0.16 0.182
genderid3 [2] -0.15 -0.22 – -0.08 <0.001
genderid3 [3] -0.10 -0.21 – -0.00 0.049
family cat [Other] 0.02 -0.06 – 0.10 0.625
family cat [School] -0.02 -0.09 – 0.05 0.664
cope3 [2] -0.06 -0.15 – 0.03 0.194
cope3 [3] -0.13 -0.22 – -0.04 0.005
Observations 952
R2 / R2 adjusted 0.052 / 0.039

3.4 LRT

LRT was used to test the sig. of variables in the baseline model

For project ABC:

## LRT for each variables

### define model formulas
full_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + genderid3 + family_cat + cope3

gender_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + family_cat + cope3

race_formula <- f1_cdi_mean ~  b_dem_sex + recode_orient + genderid3 + family_cat + cope3

sex_formula <- f1_cdi_mean ~ recode_race + recode_orient + genderid3 + family_cat + cope3


orient_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + genderid3 + family_cat + cope3

family_cat_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + genderid3  + cope3

cope_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + genderid3 + family_cat



full_person <- lm(as.formula(full_formula),data =basemodel_dat_person )
gender_person <- lm(as.formula(gender_formula),data = basemodel_dat_person)
lrt_result <- lmtest::lrtest(gender_person,full_person)

deviance <- lrt_result$Chisq[2]

# Extract the degrees of freedom
df <- lrt_result$Df[2]

# Extract the p-value
p_value <- lrt_result$`Pr(>Chisq)`[2] %>% round(2)




full_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + genderid3 + family_cat + cope3
gender_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + family_cat + cope3
race_formula <- f1_cdi_mean ~ b_dem_sex + recode_orient + genderid3 + family_cat + cope3
sex_formula <- f1_cdi_mean ~ recode_race + recode_orient + genderid3 + family_cat + cope3
orient_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + genderid3 + family_cat + cope3
family_cat_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + genderid3 + cope3
cope_formula <- f1_cdi_mean ~ recode_race + b_dem_sex + recode_orient + genderid3 + family_cat


full_abc <- lm(as.formula(full_formula), data = basemodel_dat_abc)
full_person <- lm(as.formula(full_formula), data = basemodel_dat_person)

lrt_person_f <- function(full_model, reduced_formula) {
  reduced_model <- lm(as.formula(reduced_formula), data = basemodel_dat_person)
  lrt_result <- lrtest(reduced_model,full_model )

  deviance <- lrt_result$Chisq[2]
  df <- lrt_result$Df[2]
  p_value <- round(lrt_result$`Pr(>Chisq)`[2], 2)

  return(data.frame(Deviance = deviance, DF = df, P_value = p_value))
}

lrt_abc_f <- function(full_model, reduced_formula) {
  reduced_model <- lm(as.formula(reduced_formula), data = basemodel_dat_abc)
  lrt_result <- lrtest(reduced_model,full_model)

  deviance <- lrt_result$Chisq[2]
  df <- lrt_result$Df[2]
  p_value <- round(lrt_result$`Pr(>Chisq)`[2], 2)

  return(data.frame(Deviance = deviance, DF = df, P_value = p_value))
}


reduced_models <- list(
  `Gender identity` = gender_formula,
  `Race` = race_formula,
  `Biological sex` = sex_formula,
  `Sexual Orientation` = orient_formula,
  `Challenges` = family_cat_formula,
  `Coping strategies` = cope_formula
)

## lrt for project ABC
dev_results_abc <- lapply(names(reduced_models), function(var) {
  result <- lrt_abc_f(full_abc, reduced_models[[var]])
  result$Variable <- var
  return(result)
})


Dev_df_abc <- bind_rows(dev_results_abc) %>%
  dplyr::select(Variable,everything())

kable(Dev_df_abc)
Variable Deviance DF P_value
Gender identity 15.456260 2 0.00
Race 9.522943 4 0.05
Biological sex 7.509756 1 0.01
Sexual Orientation 8.728432 2 0.01
Challenges 3.339847 2 0.19
Coping strategies 19.418374 2 0.00
## perform lrt (person)
dev_results_person <- lapply(names(reduced_models), function(var) {
  result <- lrt_person_f(full_person, reduced_models[[var]])
  result$Variable <- var
  return(result)
})




Dev_df_person <- bind_rows(dev_results_person) %>%
  dplyr::select(Variable,everything())

For project personality

kable(Dev_df_person)
Variable Deviance DF P_value
Gender identity 16.358782 2 0.00
Race 2.266644 4 0.69
Biological sex 3.441522 1 0.06
Sexual Orientation 4.084130 2 0.13
Challenges 1.220614 2 0.54
Coping strategies 10.608348 2 0.00

For Project Personality

4 Investigate HTE

4.1 Main effect only model

first, replicate the model in the original paper for comparison. the main effect model which adjusted for baseline CDI score is specified as:\[E(Y|\bf{X}) = \text{baseline CDI}+ \text{condition}\]

abc_main_dat <- dat_abc %>%  dplyr::select(f1_cdi_mean,condition,b_cdi_mean)

abc_main_m <- lm(f1_cdi_mean~ b_cdi_mean + condition,data=abc_main_dat)

person_main_dat <- dat_person %>%  dplyr::select(f1_cdi_mean,condition,b_cdi_mean)

person_main_m <- lm(f1_cdi_mean~ b_cdi_mean + condition,data=person_main_dat)

print the model, compare with the published paper (results are similar): for project ABC vs control:

abc_main_m %>% summary()
## 
## Call:
## lm(formula = f1_cdi_mean ~ b_cdi_mean + condition, data = abc_main_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.18567 -0.21921  0.01075  0.22509  1.04660 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.27471    0.04097   6.705 3.42e-11 ***
## b_cdi_mean            0.64303    0.03263  19.706  < 2e-16 ***
## conditionProject ABC -0.07953    0.02234  -3.560 0.000389 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3492 on 974 degrees of freedom
## Multiple R-squared:  0.2922, Adjusted R-squared:  0.2907 
## F-statistic:   201 on 2 and 974 DF,  p-value: < 2.2e-16
tab_model(abc_main_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.27 0.19 – 0.36 <0.001
b cdi mean 0.64 0.58 – 0.71 <0.001
condition [Project ABC] -0.08 -0.12 – -0.04 <0.001
Observations 977
R2 / R2 adjusted 0.292 / 0.291

for project personality vs control:

person_main_m %>% summary()
## 
## Call:
## lm(formula = f1_cdi_mean ~ b_cdi_mean + condition, data = person_main_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17752 -0.22054  0.01222  0.23434  1.03109 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.31125    0.04188   7.432 2.38e-13 ***
## b_cdi_mean                    0.61148    0.03329  18.366  < 2e-16 ***
## conditionProject Personality -0.06895    0.02337  -2.950  0.00325 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3604 on 949 degrees of freedom
## Multiple R-squared:  0.2664, Adjusted R-squared:  0.2649 
## F-statistic: 172.3 on 2 and 949 DF,  p-value: < 2.2e-16
tab_model(person_main_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.31 0.23 – 0.39 <0.001
b cdi mean 0.61 0.55 – 0.68 <0.001
condition [Project
Personality]
-0.07 -0.11 – -0.02 0.003
Observations 952
R2 / R2 adjusted 0.266 / 0.265

4.2 HTE model

The HTE is defined as: \[E(Y|\bf{X}) = \text{baseline CDI}+ \text{condition} + \text{lp}+\text{condition}\times \text{lp}\] where lp is the linear predictor of the baseline model.

lp_abc <- predict(abc_base_m, dat_abc)
lp_person <- predict(person_base_m, dat_person)

abc_hte_m <- lm(f1_cdi_mean~ b_cdi_mean + condition*lp_abc,data=dat_abc)
person_hte_m <- lm(f1_cdi_mean~ b_cdi_mean + condition*lp_person,data=dat_person)

4.2.1 HTE Project ABC

The hte model for Project ABC vs control:

summary(abc_hte_m)
## 
## Call:
## lm(formula = f1_cdi_mean ~ b_cdi_mean + condition * lp_abc, data = dat_abc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.16587 -0.21245  0.01587  0.22319  1.04761 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  0.20672    0.13296   1.555   0.1203    
## b_cdi_mean                   0.60042    0.03517  17.070   <2e-16 ***
## conditionProject ABC        -0.46326    0.18742  -2.472   0.0136 *  
## lp_abc                       0.11985    0.14022   0.855   0.3929    
## conditionProject ABC:lp_abc  0.39263    0.19023   2.064   0.0393 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3471 on 972 degrees of freedom
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.2991 
## F-statistic: 105.1 on 4 and 972 DF,  p-value: < 2.2e-16
tab_model(abc_hte_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.21 -0.05 – 0.47 0.120
b cdi mean 0.60 0.53 – 0.67 <0.001
condition [Project ABC] -0.46 -0.83 – -0.10 0.014
lp abc 0.12 -0.16 – 0.40 0.393
condition [Project ABC] ×
lp abc
0.39 0.02 – 0.77 0.039
Observations 977
R2 / R2 adjusted 0.302 / 0.299

4.2.2 HTE Project personality

The hte model for Project personality vs control:

summary(person_hte_m)
## 
## Call:
## lm(formula = f1_cdi_mean ~ b_cdi_mean + condition * lp_person, 
##     data = dat_person)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1604 -0.2189  0.0176  0.2321  1.0427 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                             0.18881    0.16684   1.132    0.258    
## b_cdi_mean                              0.58760    0.03544  16.580   <2e-16 ***
## conditionProject Personality           -0.28385    0.24270  -1.170    0.242    
## lp_person                               0.15184    0.17399   0.873    0.383    
## conditionProject Personality:lp_person  0.21744    0.24431   0.890    0.374    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3599 on 947 degrees of freedom
## Multiple R-squared:   0.27,  Adjusted R-squared:  0.2669 
## F-statistic: 87.56 on 4 and 947 DF,  p-value: < 2.2e-16
tab_model(person_hte_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.19 -0.14 – 0.52 0.258
b cdi mean 0.59 0.52 – 0.66 <0.001
condition [Project
Personality]
-0.28 -0.76 – 0.19 0.242
lp person 0.15 -0.19 – 0.49 0.383
condition [Project
Personality] × lp person
0.22 -0.26 – 0.70 0.374
Observations 952
R2 / R2 adjusted 0.270 / 0.267

5 Evaluate HTE

The HTE is evaluated using calibration plot.

5.1 Point estiamtes

First, report the point estiamtes of ATE and cATE.

## compute average treatment effect
#ate_dat <- dat_abc

# ate_dat$condition = 'Placebo Control'
# 
# abc_ate_ctrl <- predict(abc_main_m,ate_dat)
# 
# ate_dat$condition  = 'Project ABC'
# 
# abc_ate_trt <- predict(abc_main_m,ate_dat)
# 
# abc_ate_trt-abc_ate_ctrl

abc_ate <- coef(abc_main_m)[3]
person_ate <- coef(person_main_m)[3]

## cATE/HTE
## project ABC vs control
hte_dat <- dat_abc

hte_dat$condition = 'Placebo Control'
abc_hte_ctrl <- predict(abc_hte_m,hte_dat)

hte_dat$condition  = 'Project ABC'
abc_hte_trt <- predict(abc_hte_m,hte_dat)

abc_hte <- abc_hte_trt-abc_hte_ctrl

## hte personality vs control
hte_dat <- dat_person

hte_dat$condition = 'Placebo Control'
person_hte_ctrl <- predict(person_hte_m,hte_dat)

hte_dat$condition  = 'Project Personality'
person_hte_trt <- predict(person_hte_m,hte_dat)
person_hte <- person_hte_trt-person_hte_ctrl


point_df <- data.frame(Comparsion= c("Project ABC vs Control","Project personality vs Control" ),
                       ATE = c(abc_ate,person_ate),
                       SE = c(summary(abc_main_m)$coefficients[, "Std. Error"][3],
                              summary(person_main_m)$coefficients[, "Std. Error"][3])
                       )

kable(point_df, caption = "summary of avearged treatment effect")
(#tab:abc_cali)summary of avearged treatment effect
Comparsion ATE SE
conditionProject ABC Project ABC vs Control -0.0795347 0.0223413
conditionProject Personality Project personality vs Control -0.0689513 0.0233696
## calibration plot
abc_cal_dat <- data.frame(abc_ate,
                      abc_hte,
                      lp_abc)

abc_cal_dat$quantile_grp <- cut(abc_cal_dat$lp_abc,
                           breaks = quantile(abc_cal_dat$lp_abc, probs = seq(0, 1, by = 0.2)),
                           include.lowest = TRUE,
                           labels = FALSE)

## compute the averaged hte effect in each quantile risk grp
abc_avg_hte <- abc_cal_dat %>%
  group_by(quantile_grp) %>%
  summarise(abc_avg_hte = mean(abc_hte, na.rm = TRUE))
## replciate for project personality vs control

## calibration plot
person_cal_dat <- data.frame(person_ate,
                          person_hte,
                          lp_person)

person_cal_dat$quantile_grp <- cut(person_cal_dat$lp_person,
                               breaks = quantile(person_cal_dat$lp_person, probs = seq(0, 1, by = 0.2)),
                               include.lowest = TRUE,
                               labels = FALSE)

## compute the averaged hte effect in each quantile risk grp
person_avg_hte <- person_cal_dat %>%
  group_by(quantile_grp) %>%
  summarise(person_avg_hte = mean(person_hte, na.rm = TRUE))

### this is not correct, should boot from the very beginning
## compute bootstrapped CI
# set.seed(1017)
# person_boot_res <- NULL
# for (i in 1:1000){
#   boot.idx <- sample(1:dim(person_cal_dat)[1], size = dim(person_cal_dat)[1], replace = T)
#   boot.data <- person_cal_dat[boot.idx,]
#   boot.data$quantile_grp <- cut(boot.data$lp_person,
#                                   breaks = quantile(boot.data$lp_person, probs = seq(0, 1, by = 0.2)),
#                                   include.lowest = TRUE,
#                                   labels = FALSE)
#   
#   person_avg_hte <- boot.data %>%
#     group_by(quantile_grp) %>%
#     summarise(person_avg_hte = mean(person_hte, na.rm = TRUE))
#   
#   person_boot_res <- rbind(person_boot_res,person_avg_hte)
# }
# 
# person_boot_cali_ci <- person_boot_res %>%
#   group_by(quantile_grp) %>%
#   summarise(
#     lower_quantile = round(quantile(person_avg_hte, probs = 0.025), 4),
#     upper_quantile = round(quantile(person_avg_hte, probs = 0.975), 4)
#   )
# 
# 
# person_cali_result <- left_join(person_avg_hte,person_boot_cali_ci)

A brief summary of the linear predictors:

abc_cal_dat$lp_person %>% summary()
## Length  Class   Mode 
##      0   NULL   NULL
person_cal_dat$lp_person %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7486  0.9316  0.9751  0.9888  1.0464  1.2441

5.2 Bootstrapped CI

compute bootstrapped CI for cATE:

## define bootstrap function
boot_f <- function(mydat=NULL,seed=1017){
  set.seed(seed)
  abc_boot_res <- NULL
  for (i in 1:1000){
    boot.idx <- sample(1:dim(mydat)[1], size = dim(mydat)[1], replace = T)
    boot.data <- mydat[boot.idx,]
    
    abc_main_dat <- boot.data %>%  dplyr::select(f1_cdi_mean,condition,b_cdi_mean)
    
    abc_main_m <- lm(f1_cdi_mean~ b_cdi_mean + condition,data=abc_main_dat)
    
    lp_abc <- predict(abc_main_m,boot.data)
    
    abc_ate <- coef(abc_main_m)[3]
    
    abc_hte_m <- lm(f1_cdi_mean~ b_cdi_mean + condition*lp_abc,data=boot.data)
    
    ## cATE/HTE
    ## project ABC vs control
    hte_dat <- boot.data
    
    trt_levels <- unique(boot.data$condition) %>% as.character()
    
    trt <- trt_levels[!(trt_levels %in% 'Placebo Control')]
    
    hte_dat$condition = 'Placebo Control'
    abc_hte_ctrl <- predict(abc_hte_m,hte_dat)
    
    hte_dat$condition  = trt 
    abc_hte_trt <- predict(abc_hte_m,hte_dat)
    
    abc_hte <- abc_hte_trt-abc_hte_ctrl
    
    
    
    ## calibration plot
    abc_cal_dat <- data.frame(abc_ate,
                              abc_hte,
                              lp_abc)
    
    abc_cal_dat$quantile_grp <- cut(abc_cal_dat$lp_abc,
                                    breaks = quantile(abc_cal_dat$lp_abc, probs = seq(0, 1, by = 0.2)),
                                    include.lowest = TRUE,
                                    labels = FALSE)
    
    ## compute the averaged hte effect in each quantile risk grp
    avg_hte <- abc_cal_dat %>%
      group_by(quantile_grp) %>%
      summarise(avg_hte = mean(abc_hte, na.rm = TRUE))
    
    abc_boot_res <- rbind(abc_boot_res,avg_hte)
  }
  return(abc_boot_res)
}
## compute bootstrapped CI
abc_boot_res <- boot_f(mydat = dat_abc)
abc_boot_cali_ci <- abc_boot_res %>%
  group_by(quantile_grp) %>%
  summarise(
    lower_quantile = round(quantile(avg_hte, probs = 0.025), 4),
    upper_quantile = round(quantile(avg_hte, probs = 0.975), 4)
  )
abc_cali_result <- left_join(abc_avg_hte,abc_boot_cali_ci)

for Project personality:

## for personality

person_boot_res <- boot_f(mydat = dat_person)

person_boot_cali_ci <- person_boot_res %>%
  group_by(quantile_grp) %>%
  summarise(
    lower_quantile = round(quantile(avg_hte, probs = 0.025), 4),
    upper_quantile = round(quantile(avg_hte, probs = 0.975), 4)
  )


person_cali_result <- left_join(person_avg_hte,person_boot_cali_ci)

## identify the bounds for cali plots

max_cali_y <- max(abc_cali_result$upper_quantile,person_cali_result$upper_quantile) 

min_cali_y <- min(abc_cali_result$lower_quantile,person_cali_result$lower_quantile) 

## check if the ylims are correct 
### I used -0.2 and 0.5 range for the y axis in the calibration plots

logic1 <- min_cali_y <=  0.05 & min_cali_y >=-0.2

logic2 <- max_cali_y <=  0.05 & max_cali_y >=-0.2

if(!logic1 & ! logic2){
  print(paste0("please make sure the range of y axis in the following calibration plot is [", min_cali_y, ",",max_cali_y,']'))
}else{logic_3 = logic1+logic2}

5.3 Calibration plots

The linear predictor entered model as a continuous variable. For presentation purpose, the linear predictor is discretized into five “risk” groups using quantiles (0.2 incremental). The averaged HTEs/cATEs by “risk” group are calculated and compared with the ATE.

abc_cali_plot <- ggplot(abc_cali_result, aes(x = as.factor(quantile_grp), y = abc_avg_hte)) +
  geom_point(size = 2.5) +
  # geom_line(aes(group = 1), color = "blue") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile), width = 0.05, color = "black") +
  labs(x = "",
       y = "Mean heter_mean_diff"
       # title = ""
  ) +
  geom_hline(yintercept = abc_ate, linetype = "dashed", color = "grey")+
  theme_minimal()+
  scale_y_continuous(
    limits = c(-0.2, 0.05), 
    breaks = seq(-0.2, 0.05, by = 0.05)
  )+
 # ylim(c(min_cali_y,max_cali_y))+
  ylab("Mean difference")+
  # ylab(expression(atop("Risk Difference, %", atop("Harm" %<-% phantom("     ") %->% "Benefit")))) +  # Custom y-axis label
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 14))

For Project ABC vs control:

abc_lp_plot <- ggplot(abc_cal_dat, aes(x = lp_abc)) +
  geom_histogram(binwidth = 0.02, fill = "black") +
  labs(x = "Predicted CDI mean score using baseline covariates", y = NULL) +
  theme_minimal()

abc_combo_plot <- cowplot::plot_grid(
  abc_cali_plot,
  abc_lp_plot,
  ncol = 1,
  align = "v",
  rel_heights = c(3, 1)  
)



# Display the combined plot
abc_combo_plot

abc_cal_dat$lp_abc %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6620  0.8980  0.9743  0.9782  1.0605  1.2931

For Project personality vs control:

person_cali_plot <- ggplot(person_cali_result, aes(x = as.factor(quantile_grp), y = person_avg_hte)) +
  geom_point(size = 2.5) +
  # geom_line(aes(group = 1), color = "blue") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile), width = 0.05, color = "black") +
  labs(x = "",
       y = "Mean heter_mean_diff"
       # title = ""
       ) +
  geom_hline(yintercept = person_ate, linetype = "dashed", color = "grey")+
  theme_minimal()+
  scale_y_continuous(
    limits = c(-0.2, 0.05), 
    breaks = seq(-0.2, 0.05, by = 0.05)
  )+
  ylab("Mean difference")+
 # ylab(expression(atop("Risk Difference, %", atop("Harm" %<-% phantom("     ") %->% "Benefit")))) +  # Custom y-axis label
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 14))
person_lp_plot <- ggplot(person_cal_dat, aes(x = lp_person)) +
  geom_histogram(binwidth = 0.01, fill = "black") +
  labs(x = "Predicted CDI mean score using baseline covariates", y = NULL) +
  theme_minimal()

person_combo_plot <- cowplot::plot_grid(
  person_cali_plot,
  person_lp_plot,
  ncol = 1,
  align = "v",
  rel_heights = c(3, 1)  
)

person_combo_plot

person_cal_dat$lp_person %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7486  0.9316  0.9751  0.9888  1.0464  1.2441

combined plots for reporting purpose:

cal_plots <- ggarrange(abc_combo_plot, person_combo_plot, ncol = 2, nrow = 1)
cal_plots

ggsave("figures/cal_plots.png", width = 10, height = 7, dpi = 600)
## combined plot for report


abc_lp_plot <- ggplot(abc_cal_dat, aes(x = lp_abc)) +
  geom_histogram(binwidth = 0.03, fill = "#1f78b4",alpha=0.8) +
  labs(x = "Predicted CDI-SF mean score (Project ABC)", y = NULL) +
  theme_minimal()

person_lp_plot <- ggplot(person_cal_dat, aes(x = lp_person)) +
  geom_histogram(binwidth = 0.03, fill = "#8B0000",alpha=0.8) +
  labs(x = "Predicted CDI-SF mean score (Project Personality)", y = NULL) +
  theme_minimal()


cal_plots_report <- ggarrange(abc_lp_plot, person_lp_plot, ncol = 2, nrow = 1)
cal_plots_report

ggsave("figures/cal_plots_report.png", width = 10, height = 7, dpi = 600)


abc_cal_dat_report <- abc_cal_dat

person_cal_dat_report <- person_cal_dat


abc_cal_dat_report$grp ="Project ABC"

person_cal_dat_report$grp = "Project Personality"

colnames(person_cal_dat_report) = colnames(abc_cal_dat_report)

combined_cal_dat_report<- rbind(abc_cal_dat_report,person_cal_dat_report)


combined_hist_plot <-ggplot(combined_cal_dat_report, aes(x = lp_abc, fill = grp)) +
  geom_histogram(binwidth = 0.02, alpha = 0.8, position = "identity") +
  labs(x = "Predicted 3-month CDI-SF mean score using baseline prediction models", y = "Frequency", fill = "Project") +
  theme_minimal() +
  scale_fill_manual(values = c("Project ABC" = "#1f78b4", "Project Personality" = "#8B0000"))

combined_hist_plot

ggsave("figures/combined_hist_plot.png", width = 10, height = 7, dpi = 600)

5.3.1 Rearrange for report (For Porject ABC vs control)

Rearrange tables for reporting purpose:

max_cali_y <- 0.065
boot1_f <- function(mydat=NULL,seed=1017){
  set.seed(seed)
  abc_boot_res <- NULL
  for (i in 1:1000){
    boot.idx <- sample(1:dim(mydat)[1], size = dim(mydat)[1], replace = T)
    boot.data <- mydat[boot.idx,]

    abc_main_dat <- boot.data %>%  dplyr::select(f1_cdi_mean,condition,b_cdi_mean)

    abc_main_m <- lm(f1_cdi_mean~ b_cdi_mean + condition,data=abc_main_dat)

    lp_abc <- predict(abc_main_m,boot.data)

    abc_ate <- coef(abc_main_m)[3]

    abc_hte_m <- lm(f1_cdi_mean~ b_cdi_mean + condition*lp_abc,data=boot.data)

    ## cATE/HTE
    ## project ABC vs control
    hte_dat <- boot.data

    trt_levels <- unique(boot.data$condition) %>% as.character()

    trt <- trt_levels[!(trt_levels %in% 'Placebo Control')]

    hte_dat$condition = 'Placebo Control'
    abc_hte_ctrl <- predict(abc_hte_m,hte_dat)

    hte_dat$condition  = trt
    abc_hte_trt <- predict(abc_hte_m,hte_dat)

    abc_hte <- abc_hte_trt-abc_hte_ctrl

    ## calibration plot
    abc_cal_dat <- data.frame(abc_ate,
                              abc_hte,
                              lp_abc)

    abc_cal_dat$quantile_grp <- cut(abc_cal_dat$lp_abc,
                                    breaks = quantile(abc_cal_dat$lp_abc, probs = seq(0, 1, by = 0.1)),
                                    include.lowest = TRUE,
                                    labels = FALSE)

    ## compute the averaged hte effect in each quantile risk grp
    avg_hte <- abc_cal_dat %>%
      group_by(quantile_grp) %>%
      summarise(avg_hte = mean(abc_hte, na.rm = TRUE))

    abc_boot_res <- rbind(abc_boot_res,avg_hte)
  }
  return(abc_boot_res)
}
## calibration plot
abc_cal_dat_copy <- data.frame(abc_ate,
                      abc_hte,
                      lp_abc) 


abc_cal_dat_copy$quantile_grp <- cut(abc_cal_dat_copy$lp_abc,
                           breaks = quantile(abc_cal_dat_copy$lp_abc, probs = seq(0, 1, by = 0.1)),
                           include.lowest = TRUE,
                           labels = FALSE)

abc_cal_dat_copy$condition <- dat_abc$condition

cali_10group <- table(abc_cal_dat_copy$condition,abc_cal_dat_copy$quantile_grp) %>%  as.data.frame()

wide_10group <- tidyr::pivot_wider(cali_10group, names_from = Var1, values_from = Freq)

## risk quantiles
quantiles <- quantile(abc_cal_dat_copy$lp_abc, probs = seq(0, 1, by = 0.1)) %>% round(2)

base_score_int <- paste(head(quantiles, -1), tail(quantiles, -1), sep = " - ")

wide_10group$base_score <- base_score_int


wide_10group <-  wide_10group %>% 
  dplyr::select(Var2, base_score,everything())




## compute the averaged hte effect in each quantile risk grp
abc_avg_hte_copy <- abc_cal_dat_copy %>%
  group_by(quantile_grp) %>%
  summarise(abc_avg_hte = mean(abc_hte, na.rm = TRUE))


###boot CI

abc_boot_res_copy <- boot1_f(mydat = dat_abc)
abc_boot_cali_ci_copy <- abc_boot_res_copy %>%
  group_by(quantile_grp) %>%
  summarise(
    lower_quantile = round(quantile(avg_hte, probs = 0.025), 4),
    upper_quantile = round(quantile(avg_hte, probs = 0.975), 4)
  )
abc_cali_result_copy <- left_join(abc_avg_hte_copy,abc_boot_cali_ci_copy)
## Joining with `by = join_by(quantile_grp)`
abc_cali_result_copy <- apply(abc_cali_result_copy, 2, function(x) round(x, 2)) %>% data.frame()

abc_cali_result_copy$mean_diff <- paste0( abc_cali_result_copy$abc_avg_hte,"(",abc_cali_result_copy$lower_quantile,",",abc_cali_result_copy$upper_quantile,")")
### calibration plot data prep
cali10_plot_dat <- cbind(wide_10group,abc_cali_result_copy)

cali10_plot <-  ggplot(cali10_plot_dat, aes(x = as.factor(quantile_grp), y = abc_avg_hte)) +
  geom_point(size = 5) +
  # geom_line(aes(group = 1), color = "blue") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile),width = 0.3,linewidth = 1, color = "black") +
  labs(x = "",
       y = "Mean heter_mean_diff"
       # title = ""
       ) +
  geom_hline(yintercept = abc_ate, linetype = "dashed", color = "grey")+
   geom_text(aes(x = 10, y = round(abc_ate,3), label = paste(" ATE=", round(abc_ate,3))), 
            vjust = -1, hjust = 0.5, color = "black")+
  theme_minimal()+
  scale_y_continuous(
    limits = c(-0.2, 0.1), 
    breaks = seq(-0.2, 0.05, by = 0.05)
  )+
  ylab("Mean difference")+
  coord_flip()+
 # ylab(expression(atop("Risk Difference, %", atop("Harm" %<-% phantom("     ") %->% "Benefit")))) +  # Custom y-axis label
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 14))
## for reporting purpose
wide_10group$`Mean difference` <- abc_cali_result_copy$mean_diff

colnames(wide_10group) <- c("Score group", "Score interval", "Placebo Control", "Project ABC", "Mean difference"
)

wide_10group_plot <- wide_10group

wide_10group_plot$`Score group` <- as.character(wide_10group_plot$`Score group`)
wide_10group_plot$`Score group`[wide_10group_plot$`Score group` == "1"] <- "1 (Lowest)"
wide_10group_plot$`Score group`[wide_10group_plot$`Score group` == "10"] <- "10 (Highest)"

The calibration table and plot:

cali10_plot_dat$quantile_grp = factor(cali10_plot_dat$quantile_grp,levels =c("1","2","3","4","5","6","7","8","9","10") )
extended_levels <-c(rev(levels(cali10_plot_dat$quantile_grp)),11)
new_labels <- c(as.character(c(10:1)),"")

cali10_plot <- ggplot(cali10_plot_dat, aes(x = as.factor(quantile_grp), y = abc_avg_hte)) +
  geom_point(size = 5,shape=15,color = "#1f78b4") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile), width = 0.3,linewidth = 1,color = "#1f78b4") +
  labs(x = "", y = "Mean difference of 3-month CDI-SF mean score(Project ABC vs Placebo Control)") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey") +
  geom_hline(yintercept = abc_ate, linetype = "dashed",linewidth = 0.7,color = "#1f78b4") +
     geom_text(aes(x = 10, y = round(abc_ate,2), label = paste(" ATE=", round(abc_ate,2))), 
            vjust = -1, hjust = 0.5, color = "#1f78b4")+
  theme_minimal() +
  scale_y_continuous(limits = c(-0.2, max_cali_y), breaks = seq(-0.2, 0.1, by = 0.05)) +
  coord_flip() +
  #scale_x_discrete(limits = rev(levels(cali10_plot_dat$quantile_grp)))+
  scale_x_discrete(limits = extended_levels,labels = new_labels) +  
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 12))+
  theme(axis.text.x = element_text(size = 12))
cali10_plot

#### table plot
wide10plot <- wide_10group_plot %>% mutate(`Placebo Control` = as.character(`Placebo Control`),
                                           `Project ABC` = as.character(`Project ABC`),
                                           `Num.~Control/ABC^1`= paste0(`Placebo Control`,"/",`Project ABC`)
                                           #`Score~interval^1` = `Score interval`

)

wide10plot <- rbind(colnames(wide10plot),wide10plot)

wide10plot1 <- wide10plot
wide10plot1$`Score group` <- factor(wide10plot$`Score group`,
                                    levels = c("Score group", "1 (Lowest)", "2", "3", "4", "5", "6", "7",
                                               "8", "9", "10 (Highest)"))

wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.07(-0.12,0)"] = "-0.07(-0.12,0.00)"
wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.1(-0.16,-0.04)"] = "-0.10(-0.16,-0.04)"
wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.11(-0.2,-0.04)"] = "-0.11(-0.20,-0.04)"
wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.05(-0.12,0)"] = "-0.05(-0.12,0.00)"
wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="0(-0.14,0.06)"] = "0.00(-0.14,0.06)"
wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.16(-0.2,-0.04)"] = "-0.16(-0.20,-0.04)"
# wide10plot1$`Score group`[wide10plot1$`Score group`=="Score group"] = "Score~group^1"
# wide10plot1$`Score interval`[wide10plot1$`Score interval`=="Score interval"] = "Score~interval^2"
# wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="Mean difference"] = "Mean~difference^4"



wide10plot1$myscale <- c(1.2,2.1,3.1,4.1,5,6,7,8,9,10,11)
wide10plot1$myscale <- c(1.1, c(2:11)+0.05)



table_text <- ggplot(wide10plot1, aes(x = 1, y = myscale)) +
  geom_text(aes(label = `Score group`), size = 5, hjust = 0,nudge_x = 0) +
  geom_text(aes(label =`Score interval` ), size = 5,  hjust = 0.5, vjust = 0.5, nudge_x = 1.2) +
  #geom_text(aes(label = `Placebo Control`), size = 5, hjust = 0, nudge_x = 1.8) +
  #geom_text(aes(label = `Project ABC`), size = 5, hjust = 0, nudge_x = 3) +
  geom_text(aes(label = `Num.~Control/ABC^1`), size = 5, hjust = 0.5, vjust = 0.5, nudge_x = 2.4,parse = TRUE)+
  geom_text(aes(label = `Mean difference`), size = 5,  hjust = 0.5, vjust = 0.5, nudge_x = 3.75) +
  theme_void() +
  xlim(1, 5) +
  ylim(12, 1)



risk_strata_plot <- plot_grid(table_text, cali10_plot, ncol = 2, rel_widths = c(2, 3))

table_text

#risk_strata_plot

# ggsave("figures/abc_risk_strata.png", risk_strata_plot, width = 12, height = 6, dpi = 600)

5.3.2 Rearrange for report (For Porject personality vs control)

## calibration plot
person_cal_dat_copy <- data.frame(person_ate,
                               person_hte,
                               lp_person) 


person_cal_dat_copy$quantile_grp <- cut(person_cal_dat_copy$lp_person,
                                     breaks = quantile(person_cal_dat_copy$lp_person, probs = seq(0, 1, by = 0.1)),
                                     include.lowest = TRUE,
                                     labels = FALSE)

person_cal_dat_copy$condition <- dat_person$condition

cali_10group <- table(person_cal_dat_copy$condition,person_cal_dat_copy$quantile_grp) %>%  as.data.frame()

wide_10group <- tidyr::pivot_wider(cali_10group, names_from = Var1, values_from = Freq)

## risk quantiles
quantiles <- quantile(person_cal_dat_copy$lp_person, probs = seq(0, 1, by = 0.1)) %>% round(2)

base_score_int <- paste(head(quantiles, -1), tail(quantiles, -1), sep = " - ")

wide_10group$base_score <- base_score_int


wide_10group <-  wide_10group %>% 
  dplyr::select(Var2, base_score,everything())




## compute the averaged hte effect in each quantile risk grp
person_avg_hte_copy <- person_cal_dat_copy %>%
  group_by(quantile_grp) %>%
  summarise(person_avg_hte = mean(person_hte, na.rm = TRUE))


###boot CI

person_boot_res_copy <- boot1_f(mydat = dat_person)
person_boot_cali_ci_copy <- person_boot_res_copy %>%
  group_by(quantile_grp) %>%
  summarise(
    lower_quantile = round(quantile(avg_hte, probs = 0.025), 4),
    upper_quantile = round(quantile(avg_hte, probs = 0.975), 4)
  )
person_cali_result_copy <- left_join(person_avg_hte_copy,person_boot_cali_ci_copy)
## Joining with `by = join_by(quantile_grp)`
person_cali_result_copy <- apply(person_cali_result_copy, 2, function(x) round(x, 2)) %>% data.frame()

person_cali_result_copy$mean_diff <- paste0( person_cali_result_copy$person_avg_hte,"(",person_cali_result_copy$lower_quantile,",",person_cali_result_copy$upper_quantile,")")
### calibration plot data prep
cali10_plot_dat <- cbind(wide_10group,person_cali_result_copy)

cali10_plot <-  ggplot(cali10_plot_dat, aes(x = as.factor(quantile_grp), y = person_avg_hte)) +
  geom_point(size = 5) +
  # geom_line(aes(group = 1), color = "blue") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile), width = 0.3,linewidth = 1, color = "black") +
  labs(x = "",
       y = "Mean heter_mean_diff"
       # title = ""
  ) +
  geom_hline(yintercept = person_ate, linetype = "dashed",linewidth = 0.7, color = "grey")+
  theme_minimal()+
  scale_y_continuous(
    limits = c(-0.2, max_cali_y), 
    breaks = seq(-0.2, 0.05, by = 0.05)
  )+
  ylab("Mean difference")+
  coord_flip()+
  # ylab(expression(atop("Risk Difference, %", atop("Harm" %<-% phantom("     ") %->% "Benefit")))) +  # Custom y-axis label
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 14))
## for reporting purpose
wide_10group$`Mean difference` <- person_cali_result_copy$mean_diff

colnames(wide_10group) <- c("Score group", "Score interval", "Placebo Control", "Project Personality", "Mean difference"
)

wide_10group_plot <- wide_10group

wide_10group_plot$`Score group` <- as.character(wide_10group_plot$`Score group`)
wide_10group_plot$`Score group`[wide_10group_plot$`Score group` == "1"] <- "1 (Lowest)"
wide_10group_plot$`Score group`[wide_10group_plot$`Score group` == "10"] <- "10 (Highest)"

The calibration table and plot:

cali10_plot_dat$quantile_grp = factor(cali10_plot_dat$quantile_grp,levels =c("1","2","3","4","5","6","7","8","9","10") )
extended_levels <-c(rev(levels(cali10_plot_dat$quantile_grp)),11)
new_labels <- c(as.character(c(10:1)),"")

cali10_plot <- ggplot(cali10_plot_dat, aes(x = as.factor(quantile_grp), y = person_avg_hte)) +
  geom_point(size = 5,shape=15,color = "#8B0000") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile),width = 0.3,linewidth = 1,color = "#8B0000") +
  labs(x = "", y = "Mean difference of 3-month CDI-SF mean score(Project Personality vs Placebo Control)") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey") +
  geom_hline(yintercept = person_ate, linetype = "dashed",color = "#8B0000") +
   geom_text(aes(x = 10, y = round(person_ate,3), label = paste(" ATE=", round(person_ate,3))), 
            vjust = -1, hjust = 0.5, color = "#8B0000")+
  theme_minimal() +
  scale_y_continuous(limits = c(-0.2, max_cali_y), breaks = seq(-0.2, max_cali_y, by = 0.05)) +
  coord_flip() +
  #scale_x_discrete(limits = rev(levels(cali10_plot_dat$quantile_grp)))+
  scale_x_discrete(limits = extended_levels,labels = new_labels) +  # 反转x轴顺序
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 12))+
  theme(axis.text.x = element_text(size = 12))
cali10_plot

#### table plot
wide10plot <- wide_10group_plot %>% mutate(`Placebo Control` = as.character(`Placebo Control`),
                                           `Project Personality` = as.character(`Project Personality`),
                                           `Num.~Control/Personality^2`= paste0(`Placebo Control`,"/",`Project Personality`)
                                           #`Score~interval^1` = `Score interval`
                                           
)

wide10plot <- rbind(colnames(wide10plot),wide10plot)

wide10plot1 <- wide10plot
wide10plot1$`Score group` <- factor(wide10plot$`Score group`,
                                    levels = c("Score group", "1 (Lowest)", "2", "3", "4", "5", "6", "7",
                                               "8", "9", "10 (Highest)"))

wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.07(-0.12,0)"] = "-0.07(-0.12,0.00)"
wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.1(-0.16,-0.04)"] = "-0.10(-0.16,-0.04)"
wide10plot1$`Score interval`[wide10plot1$`Score interval`=="0.98 - 1"] = "0.98 - 1.00"
wide10plot1$`Score interval`[wide10plot1$`Score interval`=="0.87 - 0.9"] = "0.87 - 0.90"
wide10plot1$`Score interval`[wide10plot1$`Score interval`=="1 - 1.03"] = "1.00 - 1.03"

wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.06(-0.14,0)"] = "-0.06(-0.14,0.00)"

wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="-0.1(-0.15,0.01)"] = "-0.10(-0.15,0.01)"
# wide10plot1$`Score group`[wide10plot1$`Score group`=="Score group"] = "Score~group^1"
# wide10plot1$`Score interval`[wide10plot1$`Score interval`=="Score interval"] = "Score~interval^2"
# wide10plot1$`Mean difference`[wide10plot1$`Mean difference`=="Mean difference"] = "Mean~difference^4"



wide10plot1$myscale <- c(1.2,2.1,3.1,4.1,5,6,7,8,9,10,11)
wide10plot1$myscale <- c(1.1, c(2:11)+0.05)


table_text <- ggplot(wide10plot1, aes(x = 1, y = myscale)) +
  geom_text(aes(label = `Score group`), size = 5, hjust = 0,nudge_x = 0) +
  geom_text(aes(label =`Score interval` ), size = 5,  hjust = 0.5, vjust = 0.5, nudge_x = 1.2) +
  #geom_text(aes(label = `Placebo Control`), size = 5, hjust = 0, nudge_x = 1.8) +
  #geom_text(aes(label = `Project Personality`), size = 5, hjust = 0, nudge_x = 3) +
  geom_text(aes(label = `Num.~Control/Personality^2`), size = 5, hjust = 0.5, vjust = 0.5, nudge_x = 2.4,parse = TRUE)+
  geom_text(aes(label = `Mean difference`), size = 5,  hjust = 0.5, vjust = 0.5, nudge_x = 3.75) +
  theme_void() +
  xlim(1, 5) +
  ylim(12, 1)


risk_strata_plot_person <-plot_grid(table_text, cali10_plot, ncol = 2, rel_widths = c(2, 3))


# ggsave("figures/person_risk_strata.png", risk_strata_plot_person, width = 12, height = 6, dpi = 600)


risk_plots_combined <-  ggarrange(risk_strata_plot, risk_strata_plot_person, ncol = 1, nrow = 2)
table_text

# # risk_plots_combined
# ggsave("figures/risk_plots_combined.png", width = 12, height = 6, dpi = 600)

combined_plot <- arrangeGrob(risk_strata_plot, risk_strata_plot_person, ncol = 1, nrow = 2)

# Save the combined plot to a file with specified dimensions
ggsave("figures/combined_risk_plot.png", combined_plot, width = 18.2, height = 12)

5.4 Distribution in high risk groups

risk_abc_dat <- cbind(table1_dat %>% filter(condition %in% c("Project ABC", "Placebo Control")),abc_cal_dat_copy %>% 
dplyr::select(-condition))

risk_table1_abc <-tbl_summary(risk_abc_dat %>% dplyr::select(-id,-`3-month CDI mean score`),
                      by = quantile_grp, statistic = list(all_continuous() ~ "{mean} ({sd})",
                                                                        all_categorical() ~ "{n} ({p}%)"), digits = all_continuous() ~ 2,
                      # label = list(
                      #              b_dem_sex ~ "Biological Sex",
                      #              b_dem_orientation ~ "Sexual Orientation"
                      #              #,
                      #              # b_cdi_sum ~ "Baseline CDI Sum Score (0-24)"
                      #              )
                      ) %>%
  modify_header(label ~ "**Demographics**") 

risk_table1_abc_df <- as.data.frame(risk_table1_abc)
latex_table1_abc <- xtable(risk_table1_abc_df)

writeLines(print(latex_table1_abc, type = "latex",include.rownames = FALSE), paste0("table1_df_abc_",Sys.Date(),".tex"))
## % latex table generated in R 4.3.1 by xtable 1.8-4 package
## % Fri Aug  2 16:25:17 2024
## \begin{table}[ht]
## \centering
## \begin{tabular}{lllllllllll}
##   \hline
## **Demographics** & **1**  
## N = 98 & **10**  
## N = 94 & **2**  
## N = 104 & **3**  
## N = 92 & **4**  
## N = 97 & **5**  
## N = 101 & **6**  
## N = 95 & **7**  
## N = 103 & **8**  
## N = 91 & **9**  
## N = 102 \\ 
##   \hline
## condition &  &  &  &  &  &  &  &  &  &  \\ 
##   Placebo Control & 44 (45\%) & 42 (45\%) & 57 (55\%) & 52 (57\%) & 51 (53\%) & 44 (44\%) & 41 (43\%) & 51 (50\%) & 45 (49\%) & 61 (60\%) \\ 
##   Project ABC & 54 (55\%) & 52 (55\%) & 47 (45\%) & 40 (43\%) & 46 (47\%) & 57 (56\%) & 54 (57\%) & 52 (50\%) & 46 (51\%) & 41 (40\%) \\ 
##   Baseline CDI mean score(0-2) & 0.92 (0.31) & 1.37 (0.30) & 1.04 (0.37) & 1.06 (0.33) & 1.05 (0.29) & 1.14 (0.34) & 1.23 (0.31) & 1.21 (0.30) & 1.26 (0.32) & 1.29 (0.30) \\ 
##   Race &  &  &  &  &  &  &  &  &  &  \\ 
##   Asian Including Asian Desi & 18 (18\%) & 1 (1.1\%) & 13 (13\%) & 35 (38\%) & 3 (3.1\%) & 9 (8.9\%) & 10 (11\%) & 9 (8.7\%) & 5 (5.5\%) & 5 (4.9\%) \\ 
##   Black/African-American & 16 (16\%) & 4 (4.3\%) & 16 (15\%) & 5 (5.4\%) & 5 (5.2\%) & 11 (11\%) & 8 (8.4\%) & 3 (2.9\%) & 3 (3.3\%) & 2 (2.0\%) \\ 
##   Hispanic/Latinx & 38 (39\%) & 1 (1.1\%) & 27 (26\%) & 1 (1.1\%) & 16 (16\%) & 20 (20\%) & 2 (2.1\%) & 3 (2.9\%) & 5 (5.5\%) & 5 (4.9\%) \\ 
##   Mixed & 10 (10\%) & 14 (15\%) & 26 (25\%) & 1 (1.1\%) & 13 (13\%) & 17 (17\%) & 21 (22\%) & 11 (11\%) & 12 (13\%) & 17 (17\%) \\ 
##   White & 16 (16\%) & 74 (79\%) & 22 (21\%) & 50 (54\%) & 60 (62\%) & 44 (44\%) & 54 (57\%) & 77 (75\%) & 66 (73\%) & 73 (72\%) \\ 
##   Age (yrs) &  &  &  &  &  &  &  &  &  &  \\ 
##   13 & 3 (3.1\%) & 10 (11\%) & 4 (3.8\%) & 1 (1.1\%) & 5 (5.2\%) & 6 (5.9\%) & 7 (7.4\%) & 8 (7.8\%) & 9 (9.9\%) & 7 (6.9\%) \\ 
##   14 & 12 (12\%) & 16 (17\%) & 13 (13\%) & 19 (21\%) & 11 (11\%) & 21 (21\%) & 15 (16\%) & 12 (12\%) & 20 (22\%) & 19 (19\%) \\ 
##   15 & 32 (33\%) & 30 (32\%) & 30 (29\%) & 29 (32\%) & 27 (28\%) & 28 (28\%) & 27 (28\%) & 39 (38\%) & 27 (30\%) & 37 (36\%) \\ 
##   16 & 51 (52\%) & 38 (40\%) & 57 (55\%) & 43 (47\%) & 54 (56\%) & 46 (46\%) & 46 (48\%) & 44 (43\%) & 35 (38\%) & 39 (38\%) \\ 
##   Biological sex &  &  &  &  &  &  &  &  &  &  \\ 
##   Female & 66 (67\%) & 94 (100\%) & 83 (80\%) & 80 (87\%) & 87 (90\%) & 92 (91\%) & 80 (84\%) & 101 (98\%) & 87 (96\%) & 101 (99\%) \\ 
##   Male & 32 (33\%) & 0 (0\%) & 21 (20\%) & 12 (13\%) & 10 (10\%) & 9 (8.9\%) & 15 (16\%) & 2 (1.9\%) & 4 (4.4\%) & 1 (1.0\%) \\ 
##   Sexual orientation &  &  &  &  &  &  &  &  &  &  \\ 
##   Heterosexual & 77 (79\%) & 0 (0\%) & 45 (43\%) & 39 (42\%) & 5 (5.2\%) & 20 (20\%) & 16 (17\%) & 0 (0\%) & 3 (3.3\%) & 0 (0\%) \\ 
##   LGBTQ & 20 (20\%) & 64 (68\%) & 57 (55\%) & 48 (52\%) & 83 (86\%) & 68 (67\%) & 75 (79\%) & 90 (87\%) & 54 (59\%) & 77 (75\%) \\ 
##   Other & 1 (1.0\%) & 30 (32\%) & 2 (1.9\%) & 5 (5.4\%) & 9 (9.3\%) & 13 (13\%) & 4 (4.2\%) & 13 (13\%) & 34 (37\%) & 25 (25\%) \\ 
##   Language &  &  &  &  &  &  &  &  &  &  \\ 
##   English & 91 (93\%) & 93 (99\%) & 100 (96\%) & 88 (96\%) & 94 (97\%) & 99 (98\%) & 94 (99\%) & 102 (99\%) & 91 (100\%) & 100 (98\%) \\ 
##   Other & 7 (7.1\%) & 1 (1.1\%) & 4 (3.8\%) & 4 (4.3\%) & 3 (3.1\%) & 2 (2.0\%) & 1 (1.1\%) & 1 (1.0\%) & 0 (0\%) & 2 (2.0\%) \\ 
##   Gender identity &  &  &  &  &  &  &  &  &  &  \\ 
##   Non-binary & 1 (1.0\%) & 79 (84\%) & 2 (1.9\%) & 1 (1.1\%) & 5 (5.2\%) & 5 (5.0\%) & 20 (21\%) & 10 (9.7\%) & 27 (30\%) & 50 (49\%) \\ 
##   Women/girls & 70 (71\%) & 0 (0\%) & 83 (80\%) & 80 (87\%) & 86 (89\%) & 88 (87\%) & 61 (64\%) & 87 (84\%) & 48 (53\%) & 35 (34\%) \\ 
##   Male/Masculine & 27 (28\%) & 15 (16\%) & 19 (18\%) & 11 (12\%) & 6 (6.2\%) & 8 (7.9\%) & 14 (15\%) & 6 (5.8\%) & 16 (18\%) & 17 (17\%) \\ 
##   Number of challenges &  &  &  &  &  &  &  &  &  &  \\ 
##   0 & 26 (27\%) & 8 (8.5\%) & 31 (30\%) & 37 (40\%) & 9 (9.3\%) & 9 (8.9\%) & 30 (32\%) & 16 (16\%) & 17 (19\%) & 14 (14\%) \\ 
##   1 & 58 (59\%) & 44 (47\%) & 65 (63\%) & 46 (50\%) & 77 (79\%) & 58 (57\%) & 44 (46\%) & 74 (72\%) & 47 (52\%) & 60 (59\%) \\ 
##   $>$=2 & 14 (14\%) & 42 (45\%) & 8 (7.7\%) & 9 (9.8\%) & 11 (11\%) & 34 (34\%) & 21 (22\%) & 13 (13\%) & 27 (30\%) & 28 (27\%) \\ 
##   Type of challenges &  &  &  &  &  &  &  &  &  &  \\ 
##   No impact & 26 (27\%) & 8 (8.5\%) & 31 (30\%) & 37 (40\%) & 9 (9.3\%) & 9 (8.9\%) & 30 (32\%) & 16 (16\%) & 17 (19\%) & 14 (14\%) \\ 
##   Other & 11 (11\%) & 45 (48\%) & 10 (9.6\%) & 12 (13\%) & 9 (9.3\%) & 33 (33\%) & 27 (28\%) & 6 (5.8\%) & 25 (27\%) & 38 (37\%) \\ 
##   School & 61 (62\%) & 41 (44\%) & 63 (61\%) & 43 (47\%) & 79 (81\%) & 59 (58\%) & 38 (40\%) & 81 (79\%) & 49 (54\%) & 50 (49\%) \\ 
##   Type of coping strategies &  &  &  &  &  &  &  &  &  &  \\ 
##   Combined & 0 (0\%) & 35 (37\%) & 0 (0\%) & 0 (0\%) & 2 (2.1\%) & 7 (6.9\%) & 9 (9.5\%) & 13 (13\%) & 17 (19\%) & 36 (35\%) \\ 
##   No action & 11 (11\%) & 59 (63\%) & 39 (38\%) & 33 (36\%) & 25 (26\%) & 62 (61\%) & 67 (71\%) & 78 (76\%) & 47 (52\%) & 42 (41\%) \\ 
##   Positive & 87 (89\%) & 0 (0\%) & 65 (63\%) & 59 (64\%) & 70 (72\%) & 32 (32\%) & 19 (20\%) & 12 (12\%) & 27 (30\%) & 24 (24\%) \\ 
##   abc\_ate &  &  &  &  &  &  &  &  &  &  \\ 
##   -0.0795346966023499 & 98 (100\%) & 94 (100\%) & 104 (100\%) & 92 (100\%) & 97 (100\%) & 101 (100\%) & 95 (100\%) & 103 (100\%) & 91 (100\%) & 102 (100\%) \\ 
##   abc\_hte & -0.16 (0.01) & 0.00 (0.02) & -0.13 (0.01) & -0.11 (0.00) & -0.10 (0.00) & -0.09 (0.00) & -0.07 (0.00) & -0.06 (0.00) & -0.05 (0.00) & -0.03 (0.01) \\ 
##   lp\_abc & 0.78 (0.03) & 1.19 (0.04) & 0.86 (0.02) & 0.90 (0.01) & 0.92 (0.01) & 0.96 (0.01) & 1.00 (0.01) & 1.03 (0.00) & 1.06 (0.01) & 1.11 (0.02) \\ 
##    \hline
## \end{tabular}
## \end{table}
risk_person_dat <- cbind(table1_dat %>% filter(condition %in% c("Project Personality", "Placebo Control")),person_cal_dat_copy %>% dplyr::select(-condition))

risk_table1_person <- tbl_summary(risk_person_dat %>% dplyr::select(-id,-`3-month CDI mean score`),
                      by = quantile_grp, statistic = list(all_continuous() ~ "{mean} ({sd})",
                                                                        all_categorical() ~ "{n} ({p}%)"), digits = all_continuous() ~ 2,
                      # label = list(
                      #              b_dem_sex ~ "Biological Sex",
                      #              b_dem_orientation ~ "Sexual Orientation"
                      #              #,
                      #              # b_cdi_sum ~ "Baseline CDI Sum Score (0-24)"
                      #              )
                      ) %>%
  modify_header(label ~ "**Demographics**")

risk_table1_person_df <- as.data.frame(risk_table1_person)
latex_table1_person <- xtable(risk_table1_person_df)

writeLines(print(latex_table1_person, type = "latex",include.rownames = FALSE), paste0("table1_df_person_",Sys.Date(),".tex"))
## % latex table generated in R 4.3.1 by xtable 1.8-4 package
## % Fri Aug  2 16:25:19 2024
## \begin{table}[ht]
## \centering
## \begin{tabular}{lllllllllll}
##   \hline
## **Demographics** & **1**  
## N = 101 & **10**  
## N = 96 & **2**  
## N = 94 & **3**  
## N = 103 & **4**  
## N = 83 & **5**  
## N = 95 & **6**  
## N = 120 & **7**  
## N = 72 & **8**  
## N = 94 & **9**  
## N = 94 \\ 
##   \hline
## condition &  &  &  &  &  &  &  &  &  &  \\ 
##   Placebo Control & 56 (55\%) & 46 (48\%) & 53 (56\%) & 46 (45\%) & 44 (53\%) & 47 (49\%) & 63 (53\%) & 33 (46\%) & 46 (49\%) & 54 (57\%) \\ 
##   Project Personality & 45 (45\%) & 50 (52\%) & 41 (44\%) & 57 (55\%) & 39 (47\%) & 48 (51\%) & 57 (48\%) & 39 (54\%) & 48 (51\%) & 40 (43\%) \\ 
##   Baseline CDI mean score(0-2) & 0.93 (0.32) & 1.35 (0.30) & 1.02 (0.39) & 1.06 (0.33) & 1.12 (0.37) & 1.19 (0.31) & 1.19 (0.28) & 1.25 (0.30) & 1.24 (0.36) & 1.29 (0.33) \\ 
##   Race &  &  &  &  &  &  &  &  &  &  \\ 
##   Asian Including Asian Desi & 11 (11\%) & 6 (6.3\%) & 9 (9.6\%) & 3 (2.9\%) & 26 (31\%) & 15 (16\%) & 5 (4.2\%) & 15 (21\%) & 2 (2.1\%) & 8 (8.5\%) \\ 
##   Black/African-American & 14 (14\%) & 3 (3.1\%) & 12 (13\%) & 12 (12\%) & 8 (9.6\%) & 5 (5.3\%) & 4 (3.3\%) & 3 (4.2\%) & 2 (2.1\%) & 6 (6.4\%) \\ 
##   Hispanic/Latinx & 20 (20\%) & 3 (3.1\%) & 26 (28\%) & 8 (7.8\%) & 0 (0\%) & 24 (25\%) & 13 (11\%) & 5 (6.9\%) & 3 (3.2\%) & 8 (8.5\%) \\ 
##   Mixed & 9 (8.9\%) & 14 (15\%) & 23 (24\%) & 10 (9.7\%) & 9 (11\%) & 12 (13\%) & 9 (7.5\%) & 19 (26\%) & 17 (18\%) & 15 (16\%) \\ 
##   White & 47 (47\%) & 70 (73\%) & 24 (26\%) & 70 (68\%) & 40 (48\%) & 39 (41\%) & 89 (74\%) & 30 (42\%) & 70 (74\%) & 57 (61\%) \\ 
##   Age (yrs) &  &  &  &  &  &  &  &  &  &  \\ 
##   13 & 3 (3.0\%) & 7 (7.3\%) & 4 (4.3\%) & 1 (1.0\%) & 3 (3.6\%) & 10 (11\%) & 9 (7.5\%) & 7 (9.7\%) & 7 (7.4\%) & 5 (5.3\%) \\ 
##   14 & 15 (15\%) & 17 (18\%) & 11 (12\%) & 18 (17\%) & 15 (18\%) & 9 (9.5\%) & 15 (13\%) & 13 (18\%) & 13 (14\%) & 14 (15\%) \\ 
##   15 & 39 (39\%) & 28 (29\%) & 28 (30\%) & 34 (33\%) & 25 (30\%) & 31 (33\%) & 37 (31\%) & 23 (32\%) & 32 (34\%) & 35 (37\%) \\ 
##   16 & 44 (44\%) & 44 (46\%) & 51 (54\%) & 50 (49\%) & 40 (48\%) & 45 (47\%) & 59 (49\%) & 29 (40\%) & 42 (45\%) & 40 (43\%) \\ 
##   Biological sex &  &  &  &  &  &  &  &  &  &  \\ 
##   Female & 58 (57\%) & 96 (100\%) & 77 (82\%) & 94 (91\%) & 78 (94\%) & 87 (92\%) & 113 (94\%) & 71 (99\%) & 87 (93\%) & 91 (97\%) \\ 
##   Male & 43 (43\%) & 0 (0\%) & 17 (18\%) & 9 (8.7\%) & 5 (6.0\%) & 8 (8.4\%) & 7 (5.8\%) & 1 (1.4\%) & 7 (7.4\%) & 3 (3.2\%) \\ 
##   Sexual orientation &  &  &  &  &  &  &  &  &  &  \\ 
##   Heterosexual & 86 (85\%) & 0 (0\%) & 44 (47\%) & 29 (28\%) & 27 (33\%) & 18 (19\%) & 7 (5.8\%) & 3 (4.2\%) & 0 (0\%) & 0 (0\%) \\ 
##   LGBTQ & 13 (13\%) & 66 (69\%) & 45 (48\%) & 66 (64\%) & 51 (61\%) & 63 (66\%) & 87 (73\%) & 58 (81\%) & 76 (81\%) & 75 (80\%) \\ 
##   Other & 2 (2.0\%) & 30 (31\%) & 5 (5.3\%) & 8 (7.8\%) & 5 (6.0\%) & 14 (15\%) & 26 (22\%) & 11 (15\%) & 18 (19\%) & 19 (20\%) \\ 
##   Language &  &  &  &  &  &  &  &  &  &  \\ 
##   English & 94 (93\%) & 94 (98\%) & 90 (96\%) & 100 (97\%) & 80 (96\%) & 92 (97\%) & 119 (99\%) & 70 (97\%) & 94 (100\%) & 93 (99\%) \\ 
##   Other & 7 (6.9\%) & 2 (2.1\%) & 4 (4.3\%) & 3 (2.9\%) & 3 (3.6\%) & 3 (3.2\%) & 1 (0.8\%) & 2 (2.8\%) & 0 (0\%) & 1 (1.1\%) \\ 
##   Gender identity &  &  &  &  &  &  &  &  &  &  \\ 
##   Non-binary & 0 (0\%) & 91 (95\%) & 0 (0\%) & 2 (1.9\%) & 0 (0\%) & 2 (2.1\%) & 3 (2.5\%) & 3 (4.2\%) & 20 (21\%) & 67 (71\%) \\ 
##   Women/girls & 61 (60\%) & 0 (0\%) & 77 (82\%) & 93 (90\%) & 78 (94\%) & 82 (86\%) & 95 (79\%) & 61 (85\%) & 55 (59\%) & 17 (18\%) \\ 
##   Male/Masculine & 40 (40\%) & 5 (5.2\%) & 17 (18\%) & 8 (7.8\%) & 5 (6.0\%) & 11 (12\%) & 22 (18\%) & 8 (11\%) & 19 (20\%) & 10 (11\%) \\ 
##   Number of challenges &  &  &  &  &  &  &  &  &  &  \\ 
##   0 & 10 (9.9\%) & 20 (21\%) & 27 (29\%) & 4 (3.9\%) & 37 (45\%) & 20 (21\%) & 15 (13\%) & 25 (35\%) & 11 (12\%) & 14 (15\%) \\ 
##   1 & 78 (77\%) & 39 (41\%) & 56 (60\%) & 77 (75\%) & 37 (45\%) & 48 (51\%) & 85 (71\%) & 30 (42\%) & 58 (62\%) & 43 (46\%) \\ 
##   $>$=2 & 13 (13\%) & 37 (39\%) & 11 (12\%) & 22 (21\%) & 9 (11\%) & 27 (28\%) & 20 (17\%) & 17 (24\%) & 25 (27\%) & 37 (39\%) \\ 
##   Type of challenges &  &  &  &  &  &  &  &  &  &  \\ 
##   No impact & 10 (9.9\%) & 20 (21\%) & 27 (29\%) & 4 (3.9\%) & 37 (45\%) & 20 (21\%) & 15 (13\%) & 25 (35\%) & 11 (12\%) & 14 (15\%) \\ 
##   Other & 11 (11\%) & 37 (39\%) & 10 (11\%) & 21 (20\%) & 9 (11\%) & 37 (39\%) & 19 (16\%) & 19 (26\%) & 39 (41\%) & 41 (44\%) \\ 
##   School & 80 (79\%) & 39 (41\%) & 57 (61\%) & 78 (76\%) & 37 (45\%) & 38 (40\%) & 86 (72\%) & 28 (39\%) & 44 (47\%) & 39 (41\%) \\ 
##   Type of coping strategies &  &  &  &  &  &  &  &  &  &  \\ 
##   Combined & 0 (0\%) & 31 (32\%) & 0 (0\%) & 0 (0\%) & 2 (2.4\%) & 1 (1.1\%) & 8 (6.7\%) & 15 (21\%) & 29 (31\%) & 25 (27\%) \\ 
##   No action & 16 (16\%) & 65 (68\%) & 28 (30\%) & 33 (32\%) & 36 (43\%) & 63 (66\%) & 95 (79\%) & 49 (68\%) & 50 (53\%) & 29 (31\%) \\ 
##   Positive & 85 (84\%) & 0 (0\%) & 66 (70\%) & 70 (68\%) & 45 (54\%) & 31 (33\%) & 17 (14\%) & 8 (11\%) & 15 (16\%) & 40 (43\%) \\ 
##   person\_ate &  &  &  &  &  &  &  &  &  &  \\ 
##   -0.0689513014845681 & 101 (100\%) & 96 (100\%) & 94 (100\%) & 103 (100\%) & 83 (100\%) & 95 (100\%) & 120 (100\%) & 72 (100\%) & 94 (100\%) & 94 (100\%) \\ 
##   person\_hte & -0.10 (0.01) & -0.03 (0.01) & -0.09 (0.00) & -0.08 (0.00) & -0.08 (0.00) & -0.07 (0.00) & -0.07 (0.00) & -0.06 (0.00) & -0.06 (0.00) & -0.04 (0.00) \\ 
##   lp\_person & 0.84 (0.03) & 1.17 (0.03) & 0.89 (0.01) & 0.93 (0.01) & 0.94 (0.01) & 0.96 (0.01) & 1.00 (0.01) & 1.01 (0.01) & 1.05 (0.01) & 1.10 (0.02) \\ 
##    \hline
## \end{tabular}
## \end{table}

6 Sensitivity Analysis

6.1 inverse probability of missingness weighting (IPMW)

Check the missingness, and regress the missing status on variables with complete info.

model_dat <- cope_subset3 %>% 
  dplyr::select(-family_num,-b_response_id,-recode_language,-b_screener_age)
## check missingness
model_dat$missing[complete.cases(model_dat)] <- 0
model_dat$missing[!complete.cases(model_dat)] <- 1

model_dat$missing %>% table()
## .
##    0    1 
## 1441   60
str(model_dat)
## tibble [1,501 × 10] (S3: tbl_df/tbl/data.frame)
##  $ condition    : chr [1:1501] "Placebo Control" "Project Personality" "Placebo Control" "Placebo Control" ...
##  $ b_cdi_mean   : num [1:1501] 1.417 0.917 1.167 1.333 0.917 ...
##  $ f1_cdi_mean  : num [1:1501] 1 0.583 0.583 1.417 0.5 ...
##  $ recode_race  : Factor w/ 5 levels "Asian Including Asian Desi",..: 5 5 5 5 5 5 1 1 5 4 ...
##  $ b_dem_sex    : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ recode_orient: Factor w/ 3 levels "Heterosexual",..: 2 3 2 2 2 2 1 2 1 2 ...
##  $ genderid3    : Factor w/ 3 levels "1","2","3": 1 1 2 2 2 3 2 2 2 2 ...
##  $ family_cat   : Factor w/ 3 levels "No impact","Other",..: 2 2 1 3 3 3 3 3 3 2 ...
##  $ cope3        : Factor w/ 3 levels "1","2","3": 3 3 3 2 1 2 2 2 2 2 ...
##  $ missing      : num [1:1501] 0 0 0 0 0 0 0 0 0 0 ...
missed <- glm(missing ~ genderid3+b_cdi_mean+family_cat+cope3,
              data = model_dat, family = "binomial")

model_dat$fitted = missed$fitted.values
model_dat$IPCW = (1-mean(model_dat$missing))/(1-model_dat$fitted)
model_dat <- subset(model_dat, complete.cases(model_dat))

A summary of the IPCW model:

summary(missed)
## 
## Call:
## glm(formula = missing ~ genderid3 + b_cdi_mean + family_cat + 
##     cope3, family = "binomial", data = model_dat)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -3.0919     0.8430  -3.668 0.000245 ***
## genderid32        -1.2368     0.2870  -4.309 1.64e-05 ***
## genderid33        -1.2284     0.4630  -2.653 0.007978 ** 
## b_cdi_mean         0.2561     0.4193   0.611 0.541408    
## family_catOther   -0.1730     0.3659  -0.473 0.636328    
## family_catSchool  -0.5263     0.3270  -1.610 0.107490    
## cope32             0.9205     0.5452   1.688 0.091346 .  
## cope33             0.7123     0.5729   1.243 0.213729    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 503.91  on 1500  degrees of freedom
## Residual deviance: 476.46  on 1493  degrees of freedom
## AIC: 492.46
## 
## Number of Fisher Scoring iterations: 6

Summary of IPCW weights:

summary(model_dat$IPCW)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.9687  0.9815  0.9878  1.0000  1.0002  1.1427

6.2 Baseline prediction models

ipcw_abc_dat <- model_dat %>% filter(condition!="Project Personality") %>%## ABC vs control
  mutate(condition = as.character(condition),
         condition = as.factor(condition)
  )%>% dplyr::select(-condition,-b_cdi_mean,-fitted,-IPCW)


ipcw_person_dat <- model_dat %>% 
filter(condition!="Project ABC") %>%  ## Personality vs control
 mutate(condition = as.character(condition),
        condition = as.factor(condition)
        )%>% dplyr::select(-condition,-b_cdi_mean,-fitted,-IPCW)


abc_weight <- ipcw_abc_dat$IPCW
person_weight <- ipcw_person_dat$IPCW

abc_ipcw_base_m <- glm(f1_cdi_mean ~., 
             family =gaussian(link="identity"),
             weights =abc_weight,
             data = ipcw_abc_dat)


person_ipcw_base_m <- glm(f1_cdi_mean ~., 
             family =gaussian(link="identity"),
             weights =person_weight,
             data = ipcw_person_dat)
## baseline linear pred values

abc_ipcw_lp <- predict(abc_ipcw_base_m, newdata = ipcw_abc_dat)

person_ipcw_lp <- predict(person_ipcw_base_m, newdata = ipcw_person_dat)
lp_abc <- abc_ipcw_lp
lp_person <- person_ipcw_lp

baseline model for Project ABC:

tab_model(abc_ipcw_base_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 1.06 0.92 – 1.21 <0.001
recode race
[Black/African-American]
-0.04 -0.16 – 0.08 0.527
recode race
[Hispanic/Latinx]
-0.08 -0.19 – 0.02 0.128
recode race [Mixed] -0.03 -0.13 – 0.08 0.617
recode race [White] 0.03 -0.05 – 0.12 0.462
b dem sex [Male] -0.15 -0.25 – -0.04 0.006
recode orient [LGBTQ] 0.09 0.02 – 0.16 0.010
recode orient [Other] 0.12 0.03 – 0.22 0.008
genderid3 [2] -0.13 -0.20 – -0.06 <0.001
genderid3 [3] -0.02 -0.12 – 0.08 0.687
family cat [Other] 0.07 -0.01 – 0.15 0.070
family cat [School] 0.03 -0.03 – 0.10 0.313
cope3 [2] -0.06 -0.14 – 0.02 0.137
cope3 [3] -0.16 -0.24 – -0.08 <0.001
Observations 977
R2 0.079

baseline model for Project personality:

tab_model(person_ipcw_base_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 1.16 1.00 – 1.31 <0.001
recode race
[Black/African-American]
-0.06 -0.18 – 0.07 0.387
recode race
[Hispanic/Latinx]
-0.05 -0.16 – 0.06 0.382
recode race [Mixed] -0.03 -0.14 – 0.08 0.552
recode race [White] -0.00 -0.09 – 0.09 0.937
b dem sex [Male] -0.11 -0.22 – 0.01 0.065
recode orient [LGBTQ] 0.07 0.00 – 0.14 0.046
recode orient [Other] 0.06 -0.03 – 0.16 0.182
genderid3 [2] -0.15 -0.22 – -0.08 <0.001
genderid3 [3] -0.10 -0.21 – -0.00 0.049
family cat [Other] 0.02 -0.06 – 0.10 0.625
family cat [School] -0.02 -0.09 – 0.05 0.664
cope3 [2] -0.06 -0.15 – 0.03 0.193
cope3 [3] -0.13 -0.22 – -0.04 0.005
Observations 952
R2 0.052

6.3 Main effect model

ipcw_abc_dat <- model_dat %>% filter(condition!="Project Personality") %>%## ABC vs control
  mutate(condition = as.character(condition),
         condition = as.factor(condition)
  )%>% dplyr::select(-fitted,-IPCW)


ipcw_person_dat <- model_dat %>% 
filter(condition!="Project ABC") %>%  ## Personality vs control
 mutate(condition = as.character(condition),
        condition = as.factor(condition)
        )%>% dplyr::select(-fitted,-IPCW)



abc_ipcw_main_m <- glm(f1_cdi_mean ~ b_cdi_mean + condition, 
             family =gaussian(link="identity"),
             weights =abc_weight,
             data = ipcw_abc_dat)



person_ipcw_main_m <- glm(f1_cdi_mean ~b_cdi_mean + condition , 
             family =gaussian(link="identity"),
             weights =person_weight,
             data = ipcw_person_dat)

Main effect model for Project ABC:

tab_model(abc_ipcw_main_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.27 0.19 – 0.36 <0.001
b cdi mean 0.64 0.58 – 0.71 <0.001
condition [Project ABC] -0.08 -0.12 – -0.04 <0.001
Observations 977
R2 0.292

Main effect model for Project Personality:

tab_model(person_ipcw_main_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.31 0.23 – 0.39 <0.001
b cdi mean 0.61 0.55 – 0.68 <0.001
condition [Project
Personality]
-0.07 -0.11 – -0.02 0.003
Observations 952
R2 0.266

6.4 The HTE models

abc_ipcw_hte_m <- glm(f1_cdi_mean ~b_cdi_mean + condition * abc_ipcw_lp, 
             family =gaussian(link="identity"),
             weights =abc_weight,
             data = ipcw_abc_dat)



person_ipcw_hte_m <- glm(f1_cdi_mean ~b_cdi_mean + condition * person_ipcw_lp, 
             family =gaussian(link="identity"),
             weights =person_weight,
             data = ipcw_person_dat)

HTE model for Project ABC:

tab_model(abc_ipcw_hte_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.21 -0.05 – 0.47 0.120
b cdi mean 0.60 0.53 – 0.67 <0.001
condition [Project ABC] -0.46 -0.83 – -0.10 0.013
abc ipcw lp 0.12 -0.15 – 0.39 0.393
condition [Project ABC] ×
abc ipcw lp
0.39 0.02 – 0.77 0.039
Observations 977
R2 0.302

HTE model for Project Personality:

tab_model(person_ipcw_hte_m)
  f 1 cdi mean
Predictors Estimates CI p
(Intercept) 0.19 -0.14 – 0.52 0.258
b cdi mean 0.59 0.52 – 0.66 <0.001
condition [Project
Personality]
-0.28 -0.76 – 0.19 0.242
person ipcw lp 0.15 -0.19 – 0.49 0.383
condition [Project
Personality] × person
ipcw lp
0.22 -0.26 – 0.70 0.373
Observations 952
R2 0.270
## rename variabels 

abc_main_m <- abc_ipcw_main_m

person_main_m <- person_ipcw_main_m

abc_hte_m <- abc_ipcw_hte_m
  
person_hte_m <- person_ipcw_hte_m
  
  

## cATE/HTE
## project ABC vs control
hte_dat <- ipcw_abc_dat

hte_dat$condition = 'Placebo Control'
abc_hte_ctrl <- predict(abc_hte_m,hte_dat)

hte_dat$condition  = 'Project ABC'
abc_hte_trt <- predict(abc_hte_m,hte_dat)

abc_hte <- abc_hte_trt-abc_hte_ctrl

## hte personality vs control
hte_dat <- ipcw_person_dat

hte_dat$condition = 'Placebo Control'
person_hte_ctrl <- predict(person_hte_m,hte_dat)

hte_dat$condition  = 'Project Personality'
person_hte_trt <- predict(person_hte_m,hte_dat)
person_hte <- person_hte_trt-person_hte_ctrl


point_df <- data.frame(Comparsion= c("Project ABC vs Control","Project personality vs Control" ),
                       ATE = c(abc_ate,person_ate),
                       SE = c(summary(abc_main_m)$coefficients[, "Std. Error"][3],
                              summary(person_main_m)$coefficients[, "Std. Error"][3])
                       )

kable(point_df, caption = "summary of avearged treatment effect")
Table 6.1: summary of avearged treatment effect
Comparsion ATE SE
conditionProject ABC Project ABC vs Control -0.0795347 0.0223413
conditionProject Personality Project personality vs Control -0.0689513 0.0233696

6.5 Risk stratification

## calibration plot

abc_ate <- coef(abc_main_m)[3]
person_ate <- coef(person_main_m)[3]


person_cal_dat <- data.frame(person_ate,
                          person_hte,
                          lp_person)

person_cal_dat$quantile_grp <- cut(person_cal_dat$lp_person,
                               breaks = quantile(person_cal_dat$lp_person, probs = seq(0, 1, by = 0.2)),
                               include.lowest = TRUE,
                               labels = FALSE)

## compute the averaged hte effect in each quantile risk grp
person_avg_hte <- person_cal_dat %>%
  group_by(quantile_grp) %>%
  summarise(person_avg_hte = mean(person_hte, na.rm = TRUE))

6.6 Boot CI

## for ipcw data
## ipcw_person_dat
boot2_f <- function(mydat=NULL,seed=1017){
  set.seed(seed)
  abc_boot_res <- NULL
  for (i in 1:1000){
    boot.idx <- sample(1:dim(mydat)[1], size = dim(mydat)[1], replace = T)
    boot.data <- mydat[boot.idx,]
    
    abc_main_dat <- boot.data %>%  dplyr::select(f1_cdi_mean,condition,b_cdi_mean)
    
    abc_main_m <- lm(f1_cdi_mean~ b_cdi_mean + condition,data=abc_main_dat)
    
    lp_abc <- predict(abc_main_m,boot.data)
    
    abc_ate <- coef(abc_main_m)[3]
    
    abc_hte_m <- glm(f1_cdi_mean ~b_cdi_mean + condition*lp_abc, 
             family =gaussian(link="identity"),
             weights =boot.data$IPCW,
             data = boot.data)
    
    ## cATE/HTE
    ## project ABC vs control
    hte_dat <- boot.data
    
    trt_levels <- unique(boot.data$condition) %>% as.character()
    
    trt <- trt_levels[!(trt_levels %in% 'Placebo Control')]
    
    hte_dat$condition = 'Placebo Control'
    abc_hte_ctrl <- predict(abc_hte_m,hte_dat)
    
    hte_dat$condition  = trt 
    abc_hte_trt <- predict(abc_hte_m,hte_dat)
    
    abc_hte <- abc_hte_trt-abc_hte_ctrl
    
    
    
    ## calibration plot
    abc_cal_dat <- data.frame(abc_ate,
                              abc_hte,
                              lp_abc)
    
    abc_cal_dat$quantile_grp <- cut(abc_cal_dat$lp_abc,
                                    breaks = quantile(abc_cal_dat$lp_abc, probs = seq(0, 1, by = 0.2)),
                                    include.lowest = TRUE,
                                    labels = FALSE)
    
    ## compute the averaged hte effect in each quantile risk grp
    avg_hte <- abc_cal_dat %>%
      group_by(quantile_grp) %>%
      summarise(avg_hte = mean(abc_hte, na.rm = TRUE))
    
    abc_boot_res <- rbind(abc_boot_res,avg_hte)
  }
  return(abc_boot_res)
}

For Project ABC:

## compute bootstrapped CI
abc_boot_res <- boot2_f(mydat = ipcw_abc_dat)
abc_boot_cali_ci <- abc_boot_res %>%
  group_by(quantile_grp) %>%
  summarise(
    lower_quantile = round(quantile(avg_hte, probs = 0.025), 4),
    upper_quantile = round(quantile(avg_hte, probs = 0.975), 4)
  )
abc_cali_result <- left_join(abc_avg_hte,abc_boot_cali_ci)

For Project personality:

person_boot_res <- boot2_f(mydat = dat_person)

person_boot_cali_ci <- person_boot_res %>%
  group_by(quantile_grp) %>%
  summarise(
    lower_quantile = round(quantile(avg_hte, probs = 0.025), 4),
    upper_quantile = round(quantile(avg_hte, probs = 0.975), 4)
  )


person_cali_result <- left_join(person_avg_hte,person_boot_cali_ci)

## identify the bounds for cali plots

max_cali_y <- max(abc_cali_result$upper_quantile,person_cali_result$upper_quantile) 

min_cali_y <- min(abc_cali_result$lower_quantile,person_cali_result$lower_quantile) 

## check if the ylims are correct 
### I used -0.2 and 0.5 range for the y axis in the calibration plots

logic1 <- min_cali_y <=  0.05 & min_cali_y >=-0.2

logic2 <- max_cali_y <=  0.05 & max_cali_y >=-0.2

if(!logic1 & ! logic2){
  print(paste0("please make sure the range of y axis in the following calibration plot is [", min_cali_y, ",",max_cali_y,']'))
}else{logic_3 = logic1+logic2}

6.7 Risk stratification

abc_cali_plot <- ggplot(abc_cali_result, aes(x = as.factor(quantile_grp), y = abc_avg_hte)) +
  geom_point(size = 2.5,color = "#8B0000") +
  # geom_line(aes(group = 1), color = "blue") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile), width = 0.05, color = "#8B0000") +
  labs(x = "",
       y = "Mean heter_mean_diff"
       # title = ""
  ) +
  geom_hline(yintercept = abc_ate, linetype = "dashed", color = "grey")+
  theme_minimal()+
  scale_y_continuous(
    limits = c(-0.2, 0.05), 
    breaks = seq(-0.2, 0.05, by = 0.05)
  )+
 # ylim(c(min_cali_y,max_cali_y))+
  ylab("Mean difference")+
  # ylab(expression(atop("Risk Difference, %", atop("Harm" %<-% phantom("     ") %->% "Benefit")))) +  # Custom y-axis label
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 14))
abc_lp_plot <- ggplot(abc_cal_dat, aes(x = lp_abc)) +
  geom_histogram(binwidth = 0.02, fill = "#8B0000") +
  labs(x = "Predicted CDI mean score using baseline covariates (Project ABC)", y = NULL) +
  theme_minimal()

abc_combo_plot <- cowplot::plot_grid(
  abc_cali_plot,
  abc_lp_plot,
  ncol = 1,
  align = "v",
  rel_heights = c(3, 1)  
)



# Display the combined plot
abc_combo_plot

abc_cal_dat$lp_abc %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6620  0.8980  0.9743  0.9782  1.0605  1.2931
person_cali_plot <- ggplot(person_cali_result, aes(x = as.factor(quantile_grp), y = person_avg_hte)) +
  geom_point(size = 2.5, color = "#8B0000") +
  # geom_line(aes(group = 1), color = "blue") +
  geom_errorbar(aes(ymin = lower_quantile, ymax = upper_quantile), width = 0.05, color = "#8B0000") +
  labs(x = "",
       y = "Mean heter_mean_diff"
       # title = ""
       ) +
  geom_hline(yintercept = person_ate, linetype = "dashed", color = "grey")+
  theme_minimal()+
  scale_y_continuous(
    limits = c(-0.2, 0.05), 
    breaks = seq(-0.2, 0.05, by = 0.05)
  )+
  ylab("Mean difference")+
 # ylab(expression(atop("Risk Difference, %", atop("Harm" %<-% phantom("     ") %->% "Benefit")))) +  # Custom y-axis label
  theme(axis.title.y = element_text(angle = 90, vjust = 0.5, size = 14))
person_lp_plot <- ggplot(person_cal_dat, aes(x = lp_person)) +
  geom_histogram(binwidth = 0.01, fill = "#8B0000") +
  labs(x = "Predicted CDI mean score using baseline covariates(Project Personality)", y = NULL) +
  theme_minimal()

person_combo_plot <- cowplot::plot_grid(
  person_cali_plot,
  person_lp_plot,
  ncol = 1,
  align = "v",
  rel_heights = c(3, 1)  
)

person_combo_plot

cal_plots <- ggarrange(abc_combo_plot, person_combo_plot, ncol = 2, nrow = 1)
cal_plots

ggsave("figures/cal_plots_ipcw.png", width = 10, height = 7, dpi = 600)

Distribution of baseline depression severity score

abc_cal_dat_report <- abc_cal_dat

person_cal_dat_report <- person_cal_dat


abc_cal_dat_report$grp ="Project ABC"

person_cal_dat_report$grp = "Project Personality"

colnames(person_cal_dat_report) = colnames(abc_cal_dat_report)

combined_cal_dat_report<- rbind(abc_cal_dat_report,person_cal_dat_report)


combined_hist_plot <-ggplot(combined_cal_dat_report, aes(x = lp_abc, fill = grp)) +
  geom_histogram(binwidth = 0.03, alpha = 0.8, position = "identity") +
  labs(x = "Predicted 3-month CDI-SF mean score using baseline prediction models", y = "Frequency", fill = "Project") +
  theme_minimal() +
  scale_fill_manual(values = c("Project ABC" = "#1f78b4", "Project Personality" = "#8B0000"))

combined_hist_plot

ggsave("figures/combined_hist_plot_ipcw.png", width = 10, height = 7, dpi = 600)